def test_qn_management_command(self):
    """Test that the management command fires off and then does not create a job
    for an organism that does not have enough samples on the same platform."""
    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    homo_sapiens.save()

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    codes = ["1", "2", "3", "4", "5", "6"]  # We don't have a 0.tsv
    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    out = StringIO()
    try:
        call_command("create_qn_target", organism="homo_sapiens", min=1, stdout=out)
    except SystemExit as e:
        # this is okay!
        pass

    stdout = out.getvalue()
    self.assertFalse("Target file" in stdout)

    # There's not enough samples available in this scenario so we
    # shouldn't have even made a processor job.
    self.assertEqual(ProcessorJob.objects.count(), 0)
def prepare_organism_indices():
    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    # This is a lie, but this image doesn't have the dependencies for TRANSCRIPTOME_INDEX
    computational_result_short = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = c_elegans
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT/celgans_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.save()

    # This is a lie, but this image doesn't have the dependencies for TX_IMPORT
    computational_result_long = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result_long.save()
def test_qn_reference(self, mock_send_job):
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    organism.save()

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    for code in [str(i) for i in range(1, 401)]:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_name = f"Affymetrix {organism.name}"
        sample.platform_accession_code = f"A-MEXP-{organism.name}"
        sample.manufacturer = "AFFYMETRIX"
        sample.organism = organism
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.has_raw = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

        # We need more than one organism for the tests, but can't
        # repeat accession codes, so halfway through just change the organism.
        if int(code) == 200:
            organism = Organism(name="MUS_MUSCULUS", taxonomy_id=111)
            organism.save()

    # Setup is done, actually run the command.
    command = Command()
    command.handle(organisms="HOMO_SAPIENS,MUS_MUSCULUS")

    self.assertEqual(len(mock_send_job.mock_calls), 2)
    self.assertEqual(ProcessorJob.objects.count(), 2)
def test_fail(self):
    """Test our ability to fail."""
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = 'XXX'
    sample.title = 'XXX'
    sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "NOT_REAL.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['XXX']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()
    dsid = ds.id

    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)
    ds = Dataset.objects.get(id=dsid)
    print(ds.failure_reason)
    print(final_context['dataset'].failure_reason)

    self.assertNotEqual(final_context['unsmashable_files'], [])
def get_organism_with_qn_target():
    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()
    return danio_rerio
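# Hypothetical usage sketch (not from the original source): the organism returned by
# get_organism_with_qn_target() exposes its QN target the same way the QN reference
# test elsewhere in this suite does, via the ComputationalResult's computedfile_set.
#
#     danio_rerio = get_organism_with_qn_target()
#     qn_file = danio_rerio.qn_target.computedfile_set.latest()
#     assert qn_file.is_qn_target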
def prepare_job():
    # Create 10 job directories
    for i in range(JOBS):
        os.makedirs(LOCAL_ROOT_DIR + "/processor_job_" + str(i), exist_ok=True)

        # These live on prod volumes at locations such as:
        # /var/ebs/SRP057116/SRR1972985/SRR1972985.sra
        os.makedirs(LOCAL_ROOT_DIR + "/SRP" + str(i), exist_ok=True)
        os.makedirs(LOCAL_ROOT_DIR + "/SRP" + str(i) + "/SRR" + str(i), exist_ok=True)

        sample = Sample()
        sample.accession_code = "SRR" + str(i)
        sample.save()

        cr = ComputationalResult()
        cr.save()

        cf = ComputedFile()
        cf.result = cr
        cf.size_in_bytes = 666
        cf.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = cf
        scfa.save()

    # Create a job out of the range with index in it to make sure we
    # don't delete index directories since that's where transcriptome
    # indices get downloaded to.
    os.makedirs(LOCAL_ROOT_DIR + "/processor_job_" + str(JOBS + 1) + "_index", exist_ok=True)
    os.makedirs(LOCAL_ROOT_DIR + "/SRP" + str(JOBS + 1) + "/SRR" + str(JOBS + 1), exist_ok=True)

    sample = Sample()
    sample.accession_code = "SRR" + str(JOBS + 1)
    sample.save()

    # Save two jobs so that we trigger two special circumstances, one
    # where the job is still running and the other where the job isn't
    # in Batch anymore.
    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.batch_job_id = "running_job"
    pj.save()

    pj = ProcessorJob()
    pj.pipeline_applied = "SALMON"
    pj.batch_job_id = "missing_job"
    pj.save()

    pj = ProcessorJob()
    pj.pipeline_applied = "JANITOR"
    pj.save()

    return pj
def prepare_experiment(ids: List[int]) -> Experiment:
    (homo_sapiens, _) = Organism.objects.get_or_create(name="HOMO_SAPIENS", taxonomy_id=9606)

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    codes = [str(i) for i in ids]
    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    return experiment
def create_sample_for_experiment(sample_info: Dict, experiment: Experiment) -> Sample:
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = sample_info["accession_code"]
    sample.title = sample_info.get("title", None) or sample_info["accession_code"]
    sample.organism = sample_info["organism"]
    sample.technology = sample_info["technology"]
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    if sample_info.get("filename") is not None:
        computed_file = ComputedFile()
        computed_file.filename = sample_info["filename"]
        computed_file.absolute_file_path = sample_info["data_dir"] + sample_info["filename"]
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

    return sample
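# Hypothetical usage sketch (values assumed for illustration, matching the keys
# create_sample_for_experiment reads from sample_info):
#
#     sample_info = {
#         "accession_code": "GSM1237810",
#         "organism": homo_sapiens,
#         "technology": "MICROARRAY",
#         "filename": "GSM1237810_T09-1084.PCL",
#         "data_dir": "/home/user/data_store/PCL/",
#     }
#     sample = create_sample_for_experiment(sample_info, experiment)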
def setup_experiment(new_version_accessions: List[str], old_version_accessions: List[str]) -> Dict:
    """Create an experiment where some samples were processed with the newest
    version of salmon and others with an older one."""
    # Create the experiment
    experiment_accession = "SRP095529"
    data_dir = "/home/user/data_store/"
    experiment_dir = data_dir + experiment_accession
    experiment = Experiment.objects.create(accession_code=experiment_accession, technology="RNA-SEQ")

    zebrafish = Organism.get_object_for_name("DANIO_RERIO")

    # Create the transcriptome processor and result:
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.9.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.9.1"
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quant_processor = Processor()
    quant_processor.name = "Salmon Quant"
    quant_processor.version = "salmon 0.9.1"
    quant_processor.docker_image = "dr_salmon"
    quant_processor.environment = '{"some": "environment"}'
    quant_processor.save()

    for accession_code in old_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # associate with OLD organism index
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result)

    # Create another OrganismIndex with a newer version of salmon.
    transcriptome_processor = Processor()
    transcriptome_processor.name = "Transcriptome"
    transcriptome_processor.version = "salmon 0.13.1"
    transcriptome_processor.docker_image = "dr_transcriptome"
    transcriptome_processor.environment = '{"some": "environment"}'
    transcriptome_processor.save()

    computational_result_short = ComputationalResult(processor=transcriptome_processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = zebrafish
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT"
    organism_index.salmon_version = "salmon 0.13.1"  # DIFFERENT SALMON VERSION
    organism_index.save()

    comp_file = ComputedFile()
    # This path will not be used because we already have the files extracted.
    comp_file.absolute_file_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz"
    comp_file.result = computational_result_short
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    for accession_code in new_version_accessions:
        sample = Sample.objects.create(
            accession_code=accession_code,
            organism=zebrafish,
            source_database="SRA",
            technology="RNA-SEQ",
            platform_accession_code="IlluminaHiSeq1000",
        )
        ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample)

        original_file = OriginalFile()
        original_file.filename = accession_code + ".SRA"
        original_file.source_filename = accession_code + ".SRA"
        original_file.save()

        OriginalFileSampleAssociation.objects.get_or_create(original_file=original_file, sample=sample)

        # Create and associate quant result and files.
        quant_result = ComputationalResult()
        quant_result.is_ccdl = True
        quant_result.processor = quant_processor
        quant_result.organism_index = organism_index  # NEWER VERSION
        quant_result.save()

        kv = ComputationalResultAnnotation()
        kv.data = {"index_length": "short"}
        kv.result = quant_result
        kv.is_public = True
        kv.save()

        # In prod the filename pattern will involve the timestamp
        # but here we're using the accession code so we can find
        # the archive file for the current sample.
        archive_filename = "result-" + accession_code + ".tar.gz"
        archive_file = ComputedFile()
        archive_file.filename = archive_filename
        archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename)
        archive_file.is_public = False
        archive_file.is_smashable = False
        archive_file.is_qc = False
        archive_file.result = quant_result
        archive_file.size_in_bytes = 12345
        archive_file.save()

        quant_file = ComputedFile()
        quant_file.filename = "quant.sf"
        quant_file.absolute_file_path = experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf"
        quant_file.is_public = False
        quant_file.is_smashable = False
        quant_file.is_qc = False
        quant_file.result = quant_result
        quant_file.size_in_bytes = 12345
        quant_file.s3_bucket = "bucket"
        quant_file.s3_key = "key"
        quant_file.save()

        SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result)

    return experiment
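# Hypothetical usage sketch (accession codes assumed for illustration): build an
# experiment where one sample was quantified against the old index and one against
# the new index, as setup_experiment above expects.
#
#     experiment = setup_experiment(
#         new_version_accessions=["SRR1234567"],
#         old_version_accessions=["SRR7654321"],
#     )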
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result_start = log_state("start create result object", job_context["job"].id)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    # Temporary until we re-enable the QN test step.
    result.is_public = False
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file
    job_context["csv_outfile"] = job_context["output_dir"] + job_context["organism_name"] + ".tsv"
    job_context["merged_qn"].to_csv(job_context["csv_outfile"], sep="\t", encoding="utf-8")

    organism_key = list(job_context["samples"].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"][organism_key][0].organism_id,
        "organism_name": job_context["organism_name"],
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context["experiments"]],
        "total_percent_imputed": job_context["total_percent_imputed"],
    }
    annotation.save()

    # Create the resulting archive
    final_zip_base = SMASHING_DIR + str(job_context["dataset"].pk) + "_compendia"

    # Copy LICENSE.txt and correct README.md files.
    if job_context["dataset"].quant_sf_only:
        readme_file = "/home/user/README_QUANT.md"
    else:
        readme_file = "/home/user/README_NORMALIZED.md"
    shutil.copy(readme_file, job_context["output_dir"] + "/README.md")
    shutil.copy("/home/user/LICENSE_DATASET.txt", job_context["output_dir"] + "/LICENSE.TXT")

    archive_path = shutil.make_archive(final_zip_base, "zip", job_context["output_dir"])

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split("/")[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.save()

    # Compendia Result Helpers
    primary_organism = Organism.get_object_for_name(job_context["primary_organism"])
    organisms = [Organism.get_object_for_name(organism) for organism in job_context["all_organisms"]]
    compendium_version = (
        CompendiumResult.objects.filter(primary_organism=primary_organism, quant_sf_only=False).count() + 1
    )

    # Save Compendia Result
    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = job_context["dataset"].quant_sf_only
    compendium_result.svd_algorithm = job_context["dataset"].svd_algorithm
    compendium_result.compendium_version = compendium_version
    compendium_result.result = result
    compendium_result.primary_organism = primary_organism
    compendium_result.save()

    # create relations to all organisms contained in the compendia
    compendium_result_organism_associations = []
    for compendium_organism in organisms:
        compendium_result_organism_association = CompendiumResultOrganismAssociation()
        compendium_result_organism_association.compendium_result = compendium_result
        compendium_result_organism_association.organism = compendium_organism
        compendium_result_organism_associations.append(compendium_result_organism_association)

    CompendiumResultOrganismAssociation.objects.bulk_create(compendium_result_organism_associations)

    job_context["compendium_result"] = compendium_result

    logger.info("Compendium created!", archive_path=archive_path, organism_name=job_context["organism_name"])

    # Upload the result to S3
    timestamp = str(int(time.time()))
    key = job_context["organism_name"] + "_" + str(compendium_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    log_state("end create result object", job_context["job"].id, result_start)

    # TEMPORARY for iterating on compendia more quickly.
    # Reset this so the end_job does clean up the job's non-input-data stuff.
    job_context["work_dir"] = job_context["old_work_dir"]

    return job_context
def test_qn_reference(self):
    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    homo_sapiens.save()

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    # We don't have a 0.tsv
    codes = [str(i) for i in range(1, 201)]
    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    dataset = Dataset()
    dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False  # We don't QN because we're creating the target now
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    final_context = qn_reference.create_qn_reference(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["target_file"]))
    self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

    homo_sapiens.refresh_from_db()
    target = homo_sapiens.qn_target.computedfile_set.latest()
    self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

    # Create and run a smasher job that will use the QN target we just made.
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    ds = Dataset()
    ds.data = {"12345": ["1", "2", "3", "4", "5"]}
    ds.aggregate_by = "SPECIES"
    ds.scale_by = "STANDARD"
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context["success"])

    np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
    np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
def test_get_synced_files(self):
    """Test that get_synced_file_path can fetch a file from S3 and then from local disk."""
    result = ComputationalResult()
    result.save()

    computed_file = ComputedFile()
    computed_file.s3_key = "all_the_things.jpg"
    computed_file.s3_bucket = "data-refinery-test-assets"
    computed_file.filename = "all_the_things.jpg"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 9001
    computed_file.is_smashable = False
    computed_file.sha1 = "36cf21c08d461f74ddb0f2edb6257afee309c4a4"
    computed_file.save()

    # Make sure it's not there
    try:
        os.remove("/home/user/data_store/PCL/" + computed_file.filename)
    except OSError:
        pass

    # We do this twice, once to get from S3 and once to get from local disk.
    afp = computed_file.get_synced_file_path(force=True)
    self.assertTrue(os.path.exists(afp))

    afp = computed_file.get_synced_file_path(force=True)
    self.assertTrue(os.path.exists(afp))
def _create_result_objects(job_context: Dict) -> Dict:
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context['target_file']
    computed_file.filename = job_context['target_file'].split('/')[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples']['ALL'][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context['samples']['ALL'][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"]
    }
    annotation.save()

    # TODO: upload this to a public read bucket.
    # https://github.com/AlexsLemonade/refinebio/issues/586
    job_context['result'] = result
    job_context['computed_files'] = [computed_file]
    job_context['annotation'] = annotation
    job_context['success'] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    result = ComputationalResult()
    result.commands.append(" ".join(job_context['formatted_command']))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "COMPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    # Write the compendia dataframe to a file, overwriting the previous smash
    job_context['merged_qn'].to_csv(job_context['smash_outfile'], sep='\t', encoding='utf-8')

    compendia_tsv_computed_file = ComputedFile()
    compendia_tsv_computed_file.absolute_file_path = job_context['smash_outfile']
    compendia_tsv_computed_file.filename = job_context['smash_outfile'].split('/')[-1]
    compendia_tsv_computed_file.calculate_sha1()
    compendia_tsv_computed_file.calculate_size()
    compendia_tsv_computed_file.is_smashable = False
    compendia_tsv_computed_file.is_qn_target = False
    compendia_tsv_computed_file.result = result
    compendia_tsv_computed_file.save()

    organism_key = list(job_context['samples'].keys())[0]
    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context['samples'][organism_key][0].organism_id,
        "organism_name": job_context['samples'][organism_key][0].organism.name,
        "is_qn": False,
        "is_compendia": True,
        "samples": [sample.accession_code for sample in job_context["samples"][organism_key]],
        "num_samples": len(job_context["samples"][organism_key]),
        "experiment_accessions": [e.accession_code for e in job_context['experiments']]
    }
    annotation.save()

    # Save the related metadata file
    metadata_computed_file = ComputedFile()
    metadata_computed_file.absolute_file_path = job_context['metadata_tsv_paths'][0]
    metadata_computed_file.filename = job_context['metadata_tsv_paths'][0].split('/')[-1]
    metadata_computed_file.calculate_sha1()
    metadata_computed_file.calculate_size()
    metadata_computed_file.is_smashable = False
    metadata_computed_file.is_qn_target = False
    metadata_computed_file.result = result
    metadata_computed_file.save()

    # Create the resulting archive
    final_zip_base = "/home/user/data_store/smashed/" + str(job_context["dataset"].pk) + "_compendia"
    archive_path = shutil.make_archive(final_zip_base, 'zip', job_context["output_dir"])

    # Determine the next compendia version for this organism.
    organism = job_context['samples'][organism_key][0].organism
    try:
        last_compendia = ComputedFile.objects.filter(
            is_compendia=True, compendia_organism=organism).order_by('-compendia_version')[-1]
        compendia_version = last_compendia.compendia_version + 1
    except Exception as e:
        # This is the first compendia for this Organism
        compendia_version = 1

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = archive_path.split('/')[-1]
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.compendia_organism = job_context['samples'][organism_key][0].organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    logger.info("Compendia created!",
                archive_path=archive_path,
                organism_name=job_context['samples'][organism_key][0].organism.name)

    # Upload the result to S3
    key = (job_context['samples'][organism_key][0].organism.name + "_"
           + str(compendia_version) + "_" + str(int(time.time())) + ".zip")
    archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, key)

    job_context['result'] = result
    job_context['computed_files'] = [compendia_tsv_computed_file, metadata_computed_file, archive_computed_file]
    job_context['success'] = True
    return job_context
def test_dualtech_smash(self):
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

    sample = Sample()
    sample.accession_code = 'GSM1487313'
    sample.title = 'GSM1487313'
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = 'SRS332914'
    sample2.title = 'SRS332914'
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    # CROSS-SMASH BY SPECIES
    ds = Dataset()
    ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    self.assertTrue(ds.is_cross_technology())

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context['output_file']))
    os.remove(final_context['output_file'])
    self.assertEqual(len(final_context['final_frame'].columns), 2)

    # THEN BY EXPERIMENT
    ds.aggregate_by = 'EXPERIMENT'
    ds.save()

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)

    pj.start_time = None
    pj.end_time = None
    pj.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context['output_file']))
    os.remove(final_context['output_file'])
    self.assertEqual(len(final_context['final_frame'].columns), 1)

    # THEN BY ALL
    ds.aggregate_by = 'ALL'
    ds.save()

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)

    pj.start_time = None
    pj.end_time = None
    pj.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context['output_file']))
    self.assertEqual(len(final_context['final_frame'].columns), 2)
def _create_result_objects(job_context: Dict) -> Dict:
    """ Store and host the result as a ComputationalResult object. """
    archive_path = job_context["archive_path"]
    compendia_organism = _get_organisms(job_context["samples"]).first()
    compendia_version = _get_next_compendia_version(compendia_organism)

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "CREATE_QUANTPENDIA"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()

    archive_computed_file = ComputedFile()
    archive_computed_file.absolute_file_path = archive_path
    archive_computed_file.filename = FileUtils.get_filename(archive_path)
    archive_computed_file.calculate_sha1()
    archive_computed_file.calculate_size()
    archive_computed_file.is_smashable = False
    archive_computed_file.is_qn_target = False
    archive_computed_file.result = result
    archive_computed_file.is_compendia = True
    archive_computed_file.quant_sf_only = True
    archive_computed_file.compendia_organism = compendia_organism
    archive_computed_file.compendia_version = compendia_version
    archive_computed_file.save()

    compendium_result = CompendiumResult()
    compendium_result.quant_sf_only = True
    compendium_result.result = result
    compendium_result.primary_organism = compendia_organism
    compendium_result.compendium_version = compendia_version
    compendium_result.save()

    logger.info(
        "Quantpendia created! Uploading to S3.",
        job_id=job_context["job_id"],
        archive_path=archive_path,
        organism_name=compendia_organism.name,
        **get_process_stats()
    )

    # Upload the result to S3
    timestamp = str(int(time.time()))
    s3_key = compendia_organism.name + "_" + str(compendia_version) + "_" + timestamp + ".zip"
    uploaded_to_s3 = archive_computed_file.sync_to_s3(S3_COMPENDIA_BUCKET_NAME, s3_key)

    if not uploaded_to_s3:
        raise utils.ProcessorJobError(
            "Failed to upload compendia to S3",
            success=False,
            computed_file_id=archive_computed_file.id,
        )

    if settings.RUNNING_IN_CLOUD:
        archive_computed_file.delete_local_file()

    job_context["result"] = result
    job_context["success"] = True

    return job_context
def test_no_smash_dupe_two(self):
    """Tests the SRP051449 case, where the titles collide. Also uses a real QN target file."""
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "SRP051449"
    experiment.save()

    result = ComputationalResult()
    result.save()

    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

    sample = Sample()
    sample.accession_code = 'SRR1731761'
    sample.title = 'Danio rerio'
    sample.organism = danio_rerio
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'SRR1731762'
    sample.title = 'Danio rerio'
    sample.organism = danio_rerio
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'NONE'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    cr = ComputationalResult()
    cr.save()

    computed_file = ComputedFile()
    computed_file.filename = "danio_target.tsv"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = cr
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    cra = ComputationalResultAnnotation()
    cra.data = {'organism_id': danio_rerio.id, 'is_qn': True}
    cra.result = cr
    cra.save()

    final_context = smasher.smash(job.pk, upload=False)
    self.assertTrue(final_context['success'])
def _create_result(job_context: Dict) -> Dict:
    """ Create the actual Result object"""
    # This is a NO-OP, but we make a ComputationalResult regardless.
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True
    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the computed file,
    # sync it to S3 and save it.
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = job_context["output_file_path"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result)

    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it to S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(sample=sample, computed_file=computed_file)

    logger.info("Created %s", result)
    job_context["success"] = True
    return job_context
def test_get_results(self):
    """ Test our ability to collect the appropriate samples. """
    sample = Sample()
    sample.accession_code = 'GSM45588'
    sample.save()

    result = ComputationalResult()
    result.save()

    computed_file1 = ComputedFile()
    computed_file1.filename = "oh_boy.txt"
    computed_file1.result = result
    computed_file1.size_in_bytes = 123
    computed_file1.is_smashable = True
    computed_file1.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "gee_whiz.bmp"
    computed_file2.result = result
    computed_file2.size_in_bytes = 123
    computed_file2.is_smashable = False
    computed_file2.save()

    assoc = SampleResultAssociation()
    assoc.sample = sample
    assoc.result = result
    assoc.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file1
    assoc.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file2
    assoc.save()

    computed_files = sample.get_result_files()
    self.assertEqual(computed_files.count(), 2)
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    # ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
def test_bad_overlap(self):
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "big.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "small.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    # ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # Now, make sure the bad can't zero this out.
    sample = Sample()
    sample.accession_code = 'GSM999'
    sample.title = 'GSM999'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "bad.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']}
    ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    # ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    self.assertEqual(len(final_context['final_frame']), 4)
def test_compendia(self):
    result = ComputationalResult()
    result.save()

    hsc1 = ComputedFile()
    hsc1.absolute_file_path = "/null/1.tsv"
    hsc1.filename = "1.tsv"
    hsc1.sha1 = "abc"
    hsc1.size_in_bytes = 1
    hsc1.is_smashable = False
    hsc1.is_qn_target = False
    hsc1.result = result
    hsc1.is_compendia = True
    hsc1.compendia_organism = self.homo_sapiens
    hsc1.compendia_version = 1
    hsc1.s3_bucket = "dr-compendia"
    hsc1.s3_key = "hsc1.tsv"
    hsc1.save()

    hsc2 = ComputedFile()
    hsc2.absolute_file_path = "/null/2.tsv"
    hsc2.filename = "2.tsv"
    hsc2.sha1 = "abc"
    hsc2.size_in_bytes = 1
    hsc2.is_smashable = False
    hsc2.is_qn_target = False
    hsc2.result = result
    hsc2.is_compendia = True
    hsc2.compendia_organism = self.homo_sapiens
    hsc2.compendia_version = 2
    hsc2.s3_bucket = "dr-compendia"
    hsc2.s3_key = "hsc2.tsv"
    hsc2.save()

    drc1 = ComputedFile()
    drc1.absolute_file_path = "/null/1.tsv"
    drc1.filename = "1.tsv"
    drc1.sha1 = "abc"
    drc1.size_in_bytes = 1
    drc1.is_smashable = False
    drc1.is_qn_target = False
    drc1.result = result
    drc1.is_compendia = True
    drc1.compendia_organism = self.danio_rerio
    drc1.compendia_version = 1
    drc1.s3_bucket = "dr-compendia"
    drc1.s3_key = "drc2.tsv"
    drc1.save()

    response = self.client.get(
        reverse("computed_files", kwargs={"version": API_VERSION}), {"is_compendia": True})
    response_json = response.json()["results"]
    self.assertEqual(3, len(response_json))
    # Prove that the download_url field is missing and not None.
    self.assertEqual("NotPresent", response_json[0].get("download_url", "NotPresent"))

    # We don't actually want AWS to generate a temporary URL for
    # us, and it won't unless we're running in the cloud, but if
    # we provide an API Token and use the WithUrl serializer then
    # it will set the download_url field to None rather than
    # generate one.

    # Create a token first
    response = self.client.post(
        reverse("token", kwargs={"version": API_VERSION}),
        json.dumps({"is_activated": True}),
        content_type="application/json",
    )
    token_id = response.json()["id"]

    response = self.client.get(
        reverse("computed_files", kwargs={"version": API_VERSION}),
        {"is_compendia": True},
        HTTP_API_KEY=token_id,
    )
    response_json = response.json()["results"]
    self.assertEqual(3, len(response_json))
    self.assertIsNone(response_json[0]["download_url"])
def test_log2(self):
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # Has non-log2 data:
    # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
    # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
    experiment = Experiment()
    experiment.accession_code = "GSE44421"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1084806'
    sample.title = 'GSM1084806'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1084806-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1084807'
    sample.title = 'GSM1084807'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1084807-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    self.assertTrue(final_context['success'])
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(sample=sample, computed_file=computed_file)

    logger.debug("Created %s", result, processor_job=job_context["job_id"])
    job_context["success"] = True
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    """ Create the ComputationalResult objects after a Scan run is complete """
    result = ComputationalResult()
    result.commands.append(job_context['formatted_command'])
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "ILLUMINA_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)
    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Split the result into smashable subfiles
    big_tsv = job_context["output_file_path"]
    data = pd.read_csv(big_tsv, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        filename = frame.columns.values[0].replace('&', '').replace("*", '').replace(";", '') + '.tsv'
        frame_path = job_context["work_dir"] + filename
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # This needs to be the same as the ones in the job context!
        try:
            sample = job_context['samples'].get(title=frame.columns.values[0])
        except Sample.DoesNotExist:
            logger.error(
                "Could not find sample for column while splitting Illumina file.",
                title=frame.columns.values[0],
                processor_job=job_context["job_id"],
                file_path=big_tsv,
            )
            continue

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = frame_path.split('/')[-1]
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)
        SampleComputedFileAssociation.objects.get_or_create(sample=sample, computed_file=computed_file)

        individual_files.append(computed_file)

    logger.debug("Created %s", result)

    job_context["success"] = True
    job_context["individual_files"] = individual_files
    job_context["result"] = result
    return job_context
def _create_result_objects(job_context: Dict) -> Dict:
    if not job_context["create_results"]:
        return job_context

    result = ComputationalResult()
    result.commands.append(" ".join(job_context["formatted_command"]))
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]

    try:
        processor_key = "QN_REFERENCE"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["target_file"]
    computed_file.filename = job_context["target_file"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.is_smashable = False
    computed_file.is_qn_target = True
    computed_file.result = result
    computed_file.save()

    annotation = ComputationalResultAnnotation()
    annotation.result = result
    annotation.data = {
        "organism_id": job_context["samples"]["ALL"][0].organism_id,
        "is_qn": True,
        "platform_accession_code": job_context["samples"]["ALL"][0].platform_accession_code,
        "samples": [sample.accession_code for sample in job_context["samples"]["ALL"]],
        "geneset": str(job_context["geneset"]),
        "num_valid_inputs": job_context["num_valid_inputs"],
    }
    annotation.save()

    job_context["result"] = result
    job_context["computed_files"] = [computed_file]
    job_context["annotation"] = annotation
    job_context["success"] = True

    return job_context
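# For reference, the annotation payload saved above is plain JSON-serializable data.
# A concrete instance might look like the dict below; every value here is made up
# for illustration (organism_id is the Organism row's primary key, not a taxonomy id).
example_annotation_data = {
    "organism_id": 1,
    "is_qn": True,
    "platform_accession_code": "A-MEXP-1171",
    "samples": ["GSM1237810", "GSM1237811"],
    "geneset": "...",
    "num_valid_inputs": 400,
}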
def test_no_smash_all_diff_species(self):
    """ Smashing samples from different species together with 'ALL' aggregation
    is unusual behavior. This test isn't exercising a normal case; it just makes
    sure the cross-species files get marked as unsmashable.
    """
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51084"
    experiment.save()

    mus_mus = Organism.get_object_for_name("MUS_MUSCULUS")

    sample = Sample()
    sample.accession_code = 'GSM1238108'
    sample.title = 'GSM1238108'
    # The second sample must belong to a different organism for the
    # cross-species case this test exercises.
    sample.organism = mus_mus
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1238108-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}
    ds.aggregate_by = 'ALL'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    ds = Dataset.objects.get(id=ds.id)
    print(ds.failure_reason)
    print(final_context['dataset'].failure_reason)

    self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
def _populate_index_object(job_context: Dict) -> Dict:
    """ Create the ComputationalResult, ComputedFile, and OrganismIndex records for a
    newly built transcriptome index, uploading the archive to S3 if a bucket is configured.
    """
    result = ComputationalResult()
    result.commands.append(job_context["salmon_formatted_command"])
    try:
        processor_key = "TX_INDEX"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.is_ccdl = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    result.save()
    job_context['pipeline'].steps.append(result.id)

    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["computed_archive"]
    computed_file.filename = os.path.split(job_context["computed_archive"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = False
    computed_file.is_qc = False
    computed_file.save()

    organism_object = Organism.get_object_for_name(job_context['organism_name'])
    index_object = OrganismIndex()
    index_object.organism = organism_object
    index_object.source_version = job_context["assembly_version"]
    index_object.assembly_name = job_context["assembly_name"]
    index_object.salmon_version = job_context["salmon_version"]
    index_object.index_type = "TRANSCRIPTOME_" + job_context['length'].upper()
    # This is where the index will be extracted to.
    index_object.absolute_directory_path = LOCAL_ROOT_DIR + "/TRANSCRIPTOME_INDEX/" \
                                           + organism_object.name + "/" + job_context['length']
    index_object.result = result

    if S3_TRANSCRIPTOME_INDEX_BUCKET_NAME:
        logger.info("Uploading %s %s to s3",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])
        timestamp = str(timezone.now().timestamp()).split('.')[0]
        s3_key = organism_object.name + '_' + index_object.index_type + "_" + timestamp + '.tar.gz'
        sync_result = computed_file.sync_to_s3(S3_TRANSCRIPTOME_INDEX_BUCKET_NAME, s3_key)
        if sync_result:
            computed_file.delete_local_file()
    else:
        logger.warn("S3_TRANSCRIPTOME_INDEX_BUCKET_NAME not configured, therefore %s %s will not be uploaded.",
                    job_context['organism_name'],
                    job_context['length'],
                    processor_job=job_context["job_id"])

    index_object.save()

    # We uploaded the file ourselves since we wanted it to go to a
    # different bucket than end_job would put it in, therefore empty
    # this list so end_job doesn't try to upload it again.
    job_context['computed_files'] = []

    job_context['result'] = result
    job_context['computed_file'] = computed_file
    job_context['index'] = index_object

    # If there isn't both a long and a short index for this organism yet,
    # don't delete the input.
    # XXX: This will break once we introduce additional versions of these.
    short_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                 index_type="TRANSCRIPTOME_SHORT",
                                                 source_version=job_context["assembly_version"])
    long_indices = OrganismIndex.objects.filter(organism=organism_object,
                                                index_type="TRANSCRIPTOME_LONG",
                                                source_version=job_context["assembly_version"])
    if short_indices.count() < 1 or long_indices.count() < 1:
        # utils.end_job deletes these, so remove them so it doesn't.
        job_context["original_files"] = []

    return job_context
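# Small illustration of the S3 key naming used above: organism name, index type, and
# a second-resolution timestamp joined with underscores. time.time() stands in here
# for django's timezone.now(); all concrete values are made up for this example.
import time

organism_name = "CAENORHABDITIS_ELEGANS"   # assumed example organism
index_type = "TRANSCRIPTOME_SHORT"         # assumed example index type
timestamp = str(int(time.time()))
s3_key = organism_name + '_' + index_type + '_' + timestamp + '.tar.gz'
# e.g. "CAENORHABDITIS_ELEGANS_TRANSCRIPTOME_SHORT_1546300800.tar.gz"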
def test_no_smash_dupe(self):
    """ Two samples sharing the same computed file shouldn't produce duplicated,
    suffixed ('_x') columns in the merged output.
    """
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237811'
    sample.title = 'GSM1237811'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    result = ComputationalResult()
    result.save()

    # Both samples point at the same computed file; that duplication is what
    # this test exercises.
    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
    ds.aggregate_by = 'ALL'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    ds = Dataset.objects.get(id=ds.id)
    self.assertTrue(ds.success)
    for column in final_context['original_merged'].columns:
        self.assertTrue('_x' not in column)
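# Background for the '_x' assertion above: when pandas merges frames that share a
# non-key column name, it disambiguates with '_x'/'_y' suffixes by default. The toy
# frames below are made up purely to show the behavior the test guards against.
import pandas as pd

left = pd.DataFrame({"GENE": ["A", "B"], "GSM1237810": [1.0, 2.0]})
right = pd.DataFrame({"GENE": ["A", "B"], "GSM1237810": [1.0, 2.0]})
merged = left.merge(right, on="GENE")
assert list(merged.columns) == ["GENE", "GSM1237810_x", "GSM1237810_y"]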