# NOTE: These excerpts rely on the imports of the surrounding refine.bio
# modules. The standard-library and Django imports are listed below; the
# project-specific import paths are assumptions based on the refine.bio
# layout and should be verified against the repository. Helpers such as
# utils, logger, S3, S3_BUCKET_NAME, _get_tximport_inputs, and _tximport
# are defined elsewhere in the processor modules.
import json
import os
import subprocess
import tarfile
from typing import Dict, List

from django.conf import settings
from django.db import transaction
from django.utils import timezone

# Assumed project imports (refine.bio):
# from data_refinery_common.models import (
#     ComputationalResult, ComputationalResultAnnotation, ComputedFile,
#     Experiment, ExperimentOrganismAssociation, ExperimentSampleAssociation,
#     Organism, OrganismIndex, OriginalFile, OriginalFileSampleAssociation,
#     Processor, Sample, SampleResultAssociation,
# )


def setup_experiment(new_version_accessions: List[str],
                     old_version_accessions: List[str]) -> Experiment:
    """Create an experiment in which some samples were processed with the
    newest version of salmon and others with an older one.
    """
    # Create the experiment.
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # Associate with the OLD organism index.
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp,
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result)

    # Create another OrganismIndex with a newer version of salmon.
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp,
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result)

    return experiment
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon...")

    # Salmon needs to be run differently for different sample types.
    if "input_file_path_2" in job_context:
        second_read_str = " -2 {}".format(job_context["input_file_path_2"])

        # Rob recommends 16 threads/process, which fits snugly on an x1 at
        # 8GB RAM per Salmon container:
        # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
        command_str = ("salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                       " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                       " --seqBias --gcBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(
            index=job_context["index_directory"],
            input_one=job_context["input_file_path"],
            second_read_str=second_read_str,
            output_directory=job_context["output_directory"])
    else:
        # Related: https://github.com/COMBINE-lab/salmon/issues/83
        command_str = ("salmon --no-version-check quant -l A -i {index}"
                       " -r {input_one} -p 16 -o {output_directory}"
                       " --seqBias --dumpEq --writeUnmappedNames")
        formatted_command = command_str.format(
            index=job_context["index_directory"],
            input_one=job_context["input_file_path"],
            output_directory=job_context["output_directory"])

    logger.debug("Running Salmon Quant using the following shell command: %s",
                 formatted_command,
                 processor_job=job_context["job_id"])

    job_context['time_start'] = timezone.now()
    completed_command = subprocess.run(formatted_command.split(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
    job_context['time_end'] = timezone.now()

    ## To me, this looks broken: error codes are anything non-zero.
    ## However, Salmon seems to exit with negative status codes even on
    ## successful executions.
    ## Possibly related: https://github.com/COMBINE-lab/salmon/issues/55
    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error("Shell call to salmon failed with error message: %s",
                     stderr[error_start:],
                     processor_job=job_context["job_id"])

        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context['time_start']
        result.time_end = job_context['time_end']
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant.
        try:
            with tarfile.open(job_context['output_archive'], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception("Exception caught while zipping processed directory %s",
                             job_context["output_directory"],
                             processor_job=job_context["job_id"])
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context['output_archive'])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        quant_file.s3_key = "quant_files/sample_" + str(job_context["sample"].id) + "_quant.sf"
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3, though, because we
        # have to sync it before we can save the file, so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        'ACL': 'public-read',
                        'StorageClass': 'STANDARD_IA'
                    }
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table could be locked here; we lock
        # all existing rows in the ComputationalResult table. Note that the
        # queryset must be evaluated (hence the list() call) for the
        # FOR UPDATE locks to actually be taken.
        with transaction.atomic():
            list(ComputationalResult.objects.select_for_update())
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result
            job_context['pipeline'].steps.append(result.id)
            SampleResultAssociation.objects.get_or_create(sample=job_context['sample'],
                                                          result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context['computed_files'].append(salmon_quant_archive)

            tximport_inputs = _get_tximport_inputs(job_context)

        # tximport analysis is done outside of the transaction so that
        # the mutex doesn't hold the other jobs too long.
        for experiment, quant_files in tximport_inputs.items():
            _tximport(job_context, experiment, quant_files)
            # If `tximport` on any related experiment fails, exit immediately.
            if not job_context["success"]:
                return job_context

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": job_context["index_length"]}
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'],
                               'lib_format_counts.json')) as lfc_file:
            format_count_data = json.load(lfc_file)
        kv = ComputationalResultAnnotation()
        kv.data = format_count_data
        kv.result = result
        kv.is_public = True
        kv.save()

        with open(os.path.join(job_context['output_directory'], 'aux_info',
                               'meta_info.json')) as mi_file:
            meta_info = json.load(mi_file)
        kv = ComputationalResultAnnotation()
        kv.data = meta_info
        kv.result = result
        kv.is_public = True
        kv.save()

        job_context["success"] = True

    return job_context
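# A minimal sketch of the select_for_update() mutex pattern used above
# (hypothetical helper; not part of the original module). A Django queryset
# only takes its FOR UPDATE row locks when it is evaluated, which is why the
# list() call matters: concurrent workers block on it until the first
# transaction commits, serializing everything inside the atomic block.
def save_result_serially(result: ComputationalResult) -> None:
    with transaction.atomic():
        # Evaluating the queryset acquires the row locks.
        list(ComputationalResult.objects.select_for_update())
        result.save()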
# A later revision of _run_salmon: it streams SRA inputs through named pipes
# instead of extracting them to disk, adds a three-hour timeout, marks salmon
# failures as non-retryable, and parses Salmon's JSON output defensively.
def _run_salmon(job_context: Dict) -> Dict:
    """Runs Salmon Quant."""
    logger.debug("Running Salmon...")

    # Salmon needs to be run differently for different sample types.
    # SRA files also get processed differently as we don't want to use
    # fasterq-dump to extract them to disk.
    if job_context.get("sra_input_file_path", None):
        # Single reads
        if job_context["sra_num_reads"] == 1:
            fifo = "/tmp/barney"
            os.mkfifo(fifo)

            dump_str = "fastq-dump --stdout {input_sra_file} > {fifo} &"
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"], fifo=fifo)
            subprocess.Popen(formatted_dump_command,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)

            command_str = ("salmon --no-version-check quant -l A -i {index} "
                           "-r {fifo} -p 16 -o {output_directory} "
                           "--seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo=fifo,
                output_directory=job_context["output_directory"],
            )
        # Paired reads are trickier.
        else:
            # For reasons we can't explain, this only works when the FIFOs
            # live in /tmp; elsewhere the `tee` part will only output to one
            # or the other of the streams (non-deterministically), but not
            # both.
            alpha = "/tmp/alpha"
            os.mkfifo(alpha)
            beta = "/tmp/beta"
            os.mkfifo(beta)

            dump_str = (r"fastq-dump --stdout --split-files -I {input_sra_file} | tee "
                        r">(grep '@.*\.1\s' -A3 --no-group-separator > {fifo_alpha}) "
                        r">(grep '@.*\.2\s' -A3 --no-group-separator > {fifo_beta}) "
                        r"> /dev/null &")
            formatted_dump_command = dump_str.format(
                input_sra_file=job_context["sra_input_file_path"],
                fifo_alpha=alpha,
                fifo_beta=beta)
            subprocess.Popen(
                formatted_dump_command,
                shell=True,
                executable="/bin/bash",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )

            command_str = ("salmon --no-version-check quant -l A -i {index} "
                           "-1 {fifo_alpha} -2 {fifo_beta} -p 16 -o {output_directory} "
                           "--seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                fifo_alpha=alpha,
                fifo_beta=beta,
                output_directory=job_context["output_directory"],
            )
    else:
        if "input_file_path_2" in job_context:
            second_read_str = " -2 {}".format(job_context["input_file_path_2"])

            # Rob recommends 16 threads/process, which fits snugly on an x1
            # at 8GB RAM per Salmon container:
            # (2 threads/core * 16 cores/socket * 64 vCPU) / (1TB/8GB) = ~17
            command_str = (
                "salmon --no-version-check quant -l A --biasSpeedSamp 5 -i {index}"
                " -1 {input_one}{second_read_str} -p 16 -o {output_directory}"
                " --seqBias --gcBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                second_read_str=second_read_str,
                output_directory=job_context["output_directory"],
            )
        else:
            # Related: https://github.com/COMBINE-lab/salmon/issues/83
            command_str = ("salmon --no-version-check quant -l A -i {index}"
                           " -r {input_one} -p 16 -o {output_directory}"
                           " --seqBias --dumpEq --writeUnmappedNames")
            formatted_command = command_str.format(
                index=job_context["index_directory"],
                input_one=job_context["input_file_path"],
                output_directory=job_context["output_directory"],
            )

    logger.debug(
        "Running Salmon Quant using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    # Salmon probably shouldn't take longer than three hours.
    timeout = 60 * 60 * 3
    job_context["time_start"] = timezone.now()
    try:
        completed_command = subprocess.run(
            formatted_command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        failure_reason = "Salmon timed out because it failed to complete within 3 hours."
        logger.error(
            failure_reason,
            sample_accession_code=job_context["sample"].accession_code,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = failure_reason
        job_context["job"].no_retry = True
        job_context["success"] = False
        return job_context

    job_context["time_end"] = timezone.now()

    if completed_command.returncode == 1:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0
        logger.error(
            "Shell call to salmon failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        # If salmon has an error exit code then we don't want to retry it.
        job_context["job"].no_retry = True
        job_context["job"].failure_reason = ("Shell call to salmon failed because: "
                                             + stderr[error_start:])
        job_context["success"] = False
    else:
        result = ComputationalResult()
        result.commands.append(formatted_command)
        result.time_start = job_context["time_start"]
        result.time_end = job_context["time_end"]
        result.organism_index = job_context["organism_index"]
        result.is_ccdl = True

        try:
            processor_key = "SALMON_QUANT"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        # Zip up the output of Salmon Quant.
        try:
            with tarfile.open(job_context["output_archive"], "w:gz") as tar:
                tar.add(job_context["output_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["output_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping processed directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["output_archive"])
            job_context["success"] = False
            return job_context

        salmon_quant_archive = ComputedFile()
        salmon_quant_archive.absolute_file_path = job_context["output_archive"]
        salmon_quant_archive.filename = os.path.split(job_context["output_archive"])[-1]
        salmon_quant_archive.calculate_sha1()
        salmon_quant_archive.calculate_size()
        salmon_quant_archive.is_public = True
        salmon_quant_archive.is_smashable = False
        salmon_quant_archive.is_qc = False

        quant_file = ComputedFile()
        quant_file.s3_bucket = S3_BUCKET_NAME
        timestamp = str(timezone.now().timestamp()).split(".")[0]
        quant_file.s3_key = "quant_files/sample_{0}_{1}_quant.sf".format(
            job_context["sample"].id, timestamp)
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = job_context["output_directory"] + "quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.calculate_sha1()
        quant_file.calculate_size()

        # If we're running in the cloud we need to upload the quant.sf
        # file so that it can be used by a job running on any machine
        # to run tximport. We can't use sync_to_s3, though, because we
        # have to sync it before we can save the file, so it cannot be
        # discovered by other jobs before it is uploaded.
        if settings.RUNNING_IN_CLOUD:
            try:
                S3.upload_file(
                    quant_file.absolute_file_path,
                    quant_file.s3_bucket,
                    quant_file.s3_key,
                    ExtraArgs={
                        "ACL": "public-read",
                        "StorageClass": "STANDARD_IA"
                    },
                )
            except Exception as e:
                logger.exception(e,
                                 processor_job=job_context["job_id"],
                                 sample=job_context["sample"].id)
                failure_template = "Exception caught while uploading quantfile to S3: {}"
                job_context["job"].failure_reason = failure_template.format(
                    quant_file.absolute_file_path)
                job_context["success"] = False
                return job_context

        # Here select_for_update() is used as a mutex that forces multiple
        # jobs to execute this block of code in a serial manner. See:
        # https://docs.djangoproject.com/en/1.11/ref/models/querysets/#select-for-update
        # Theoretically any rows in any table could be locked here; we lock
        # all existing rows in the ComputationalResult table. Note that the
        # queryset must be evaluated (hence the list() call) for the
        # FOR UPDATE locks to actually be taken.
        with transaction.atomic():
            list(ComputationalResult.objects.select_for_update())
            result.save()
            job_context["quant_result"] = result
            quant_file.result = result
            quant_file.save()

            job_context["result"] = result
            job_context["pipeline"].steps.append(result.id)
            SampleResultAssociation.objects.get_or_create(
                sample=job_context["sample"], result=result)

            salmon_quant_archive.result = result
            salmon_quant_archive.save()
            job_context["computed_files"].append(salmon_quant_archive)

        kv = ComputationalResultAnnotation()
        kv.data = {
            "index_length": job_context["index_length"],
            "index_length_get": job_context.get("index_length_raw", None),
        }
        kv.result = result
        kv.is_public = True
        kv.save()

        try:
            with open(os.path.join(job_context["output_directory"],
                                   "lib_format_counts.json")) as lfc_file:
                format_count_data = json.load(lfc_file)
            kv = ComputationalResultAnnotation()
            kv.data = format_count_data
            kv.result = result
            kv.is_public = True
            kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception(
                "Error parsing Salmon lib_format_counts JSON output!",
                processor_job=job_context["job_id"],
            )

        try:
            with open(os.path.join(job_context["output_directory"], "aux_info",
                                   "meta_info.json")) as mi_file:
                meta_info = json.load(mi_file)
            kv = ComputationalResultAnnotation()
            kv.data = meta_info
            kv.result = result
            kv.is_public = True
            kv.save()
        except Exception:
            # See: https://github.com/AlexsLemonade/refinebio/issues/1167
            logger.exception("Error parsing Salmon meta_info JSON output!",
                             processor_job=job_context["job_id"])

        job_context["success"] = True

    return job_context
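# A minimal, self-contained sketch of the FIFO-splitting trick used above
# (hypothetical paths and toy data; requires bash for the >() process
# substitutions and a POSIX system for os.mkfifo). An interleaved stream is
# split by `tee` + `grep` into two named pipes that a downstream consumer
# (salmon, in the real code) can read concurrently.
def demo_fifo_split() -> None:
    r1, r2 = "/tmp/demo_r1", "/tmp/demo_r2"
    for path in (r1, r2):
        if not os.path.exists(path):
            os.mkfifo(path)

    # Toy "reads": mate .1 lines go to r1, mate .2 lines go to r2.
    split_cmd = (
        r"printf 'x.1 A\nx.2 B\ny.1 C\ny.2 D\n'"
        r" | tee >(grep '\.1 ' > {r1}) >(grep '\.2 ' > {r2}) > /dev/null &"
    ).format(r1=r1, r2=r2)
    subprocess.Popen(split_cmd, shell=True, executable="/bin/bash")

    # Opening a FIFO for reading blocks until the writer side opens it too.
    with open(r1) as f1, open(r2) as f2:
        print(f1.read())  # -> "x.1 A\ny.1 C\n"
        print(f2.read())  # -> "x.2 B\ny.2 D\n"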
def prep_tximport_at_progress_point(complete_accessions: List[str],
                                    incomplete_accessions: List[str]) -> Experiment:
    """Create an experiment and the associated objects that tximport needs to run on it.

    Creates a sample for each accession contained in either input list.
    The samples in complete_accessions will be simulated as already having
    salmon quant run on them. The samples in incomplete_accessions won't.
    """
    # Create the experiment.
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession,
                                           technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")
    ExperimentOrganismAssociation.objects.get_or_create(experiment=experiment,
                                                        organism=zebrafish)

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = (
        "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz")
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in incomplete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.13.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    tximport_processor = Processor()
    tximport_processor.name = "Tximport"
    tximport_processor.version = "salmon 0.13.1"
    tximport_processor.docker_image = "dr_salmon"
    tximport_processor.environment = '{"some": "environment"}'
    tximport_processor.save()

    # Create the already processed samples along with their
    # ComputationalResults and ComputedFiles. They don't need
    # original files for this test because we aren't going to run
    # salmon quant on them.
    for accession_code in complete_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp,
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = (experiment_dir + "/quant_files/" +
                                         accession_code + "_output/quant.sf")
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        sample.most_recent_quant_file = quant_file
        sample.save()

        SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result)

    return experiment
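# Hypothetical usage sketch (accession codes are made up; not part of the
# original module): seed an experiment in which two samples already have
# quant.sf files and one does not, e.g. to verify that tximport is triggered
# only once enough of the experiment's samples have been quantified.
def sketch_prep_tximport() -> None:
    prep_tximport_at_progress_point(
        complete_accessions=["COMPLETE_1", "COMPLETE_2"],
        incomplete_accessions=["INCOMPLETE_1"],
    )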