def test_notify(self): ds = Dataset() ds.data = {'GSM1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() job_context = {} job_context['job'] = pj job_context['dataset'] = ds job_context['upload'] = True job_context[ 'result_url'] = 'https://s3.amazonaws.com/data-refinery-test-assets/all_the_things.jpg' final_context = smasher._notify(job_context) self.assertTrue(final_context.get('success', True))
def test_fail(self): """ Test our ability to fail """ result = ComputationalResult() result.save() sample = Sample() sample.accession_code = 'XXX' sample.title = 'XXX' sample.organism = Organism.get_object_for_name("HOMO_SAPIENS") sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "NOT_REAL.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['XXX']} ds.aggregate_by = 'EXPERIMENT' ds.scale_by = 'MINMAX' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() dsid = ds.id job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = smasher.smash(job.pk, upload=False) ds = Dataset.objects.get(id=dsid) print(ds.failure_reason) print(final_context['dataset'].failure_reason) self.assertNotEqual(final_context['unsmashable_files'], [])
def test_qn_reference(self): job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606) homo_sapiens.save() experiment = Experiment() experiment.accession_code = "12345" experiment.save() # We don't have a 0.tsv codes = [str(i) for i in range(1, 201)] for code in codes: sample = Sample() sample.accession_code = code sample.title = code sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS" sample.organism = homo_sapiens sample.technology = "MICROARRAY" sample.is_processed = True sample.save() cr = ComputationalResult() cr.save() computed_file = ComputedFile() computed_file.filename = code + ".tsv" computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv" computed_file.size_in_bytes = int(code) computed_file.result = cr computed_file.is_smashable = True computed_file.save() scfa = SampleComputedFileAssociation() scfa.sample = sample scfa.computed_file = computed_file scfa.save() exsa = ExperimentSampleAssociation() exsa.experiment = experiment exsa.sample = sample exsa.save() dataset = Dataset() dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]} dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False # We don't QN because we're creating the target now dataset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["target_file"])) self.assertEqual(os.path.getsize(final_context["target_file"]), 562) homo_sapiens.refresh_from_db() target = homo_sapiens.qn_target.computedfile_set.latest() self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a") # Create and run a smasher job that will use the QN target we just made. pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() ds = Dataset() ds.data = {"12345": ["1", "2", "3", "4", "5"]} ds.aggregate_by = "SPECIES" ds.scale_by = "STANDARD" ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(final_context["success"]) np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811) np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
def test_dualtech_smash(self): """ """ pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1487313" experiment.save() result = ComputationalResult() result.save() gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS") sample = Sample() sample.accession_code = 'GSM1487313' sample.title = 'GSM1487313' sample.organism = gallus_gallus sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1487313_liver.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() # RNASEQ TECH experiment2 = Experiment() experiment2.accession_code = "SRS332914" experiment2.save() result2 = ComputationalResult() result2.save() sample2 = Sample() sample2.accession_code = 'SRS332914' sample2.title = 'SRS332914' sample2.organism = gallus_gallus sample2.technology = "RNA-SEQ" sample2.save() sra2 = SampleResultAssociation() sra2.sample = sample2 sra2.result = result2 sra2.save() esa2 = ExperimentSampleAssociation() esa2.experiment = experiment2 esa2.sample = sample2 esa2.save() computed_file2 = ComputedFile() computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv" computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename computed_file2.result = result2 computed_file2.size_in_bytes = 234 computed_file2.is_smashable = True computed_file2.save() assoc2 = SampleComputedFileAssociation() assoc2.sample = sample2 assoc2.computed_file = computed_file2 assoc2.save() # CROSS-SMASH BY SPECIES ds = Dataset() ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() self.assertTrue(ds.is_cross_technology()) final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) os.remove(final_context['output_file']) self.assertEqual(len(final_context['final_frame'].columns), 2) # THEN BY EXPERIMENT ds.aggregate_by = 'EXPERIMENT' ds.save() dsid = ds.id ds = Dataset.objects.get(id=dsid) pj.start_time = None pj.end_time = None pj.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) os.remove(final_context['output_file']) self.assertEqual(len(final_context['final_frame'].columns), 1) # THEN BY ALL ds.aggregate_by = 'ALL' ds.save() dsid = ds.id ds = Dataset.objects.get(id=dsid) pj.start_time = None pj.end_time = None pj.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) self.assertEqual(len(final_context['final_frame'].columns), 2)
def test_log2(self): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # Has non-log2 data: # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421 # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz experiment = Experiment() experiment.accession_code = "GSE44421" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1084806' sample.title = 'GSM1084806' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1084806-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1084807' sample.title = 'GSM1084807' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1084807-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']} ds.aggregate_by = 'EXPERIMENT' ds.scale_by = 'MINMAX' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) self.assertTrue(final_context['success'])
def test_no_smash_dupe_two(self): """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file.""" job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "SRP051449" experiment.save() result = ComputationalResult() result.save() danio_rerio = Organism.get_object_for_name("DANIO_RERIO") sample = Sample() sample.accession_code = 'SRR1731761' sample.title = 'Danio rerio' sample.organism = danio_rerio sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'SRR1731762' sample.title = 'Danio rerio' sample.organism = danio_rerio sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'NONE' ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() cr = ComputationalResult() cr.save() computed_file = ComputedFile() computed_file.filename = "danio_target.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = cr computed_file.size_in_bytes = 123 computed_file.is_smashable = False computed_file.save() cra = ComputationalResultAnnotation() cra.data = {'organism_id': danio_rerio.id, 'is_qn': True} cra.result = cr cra.save() final_context = smasher.smash(job.pk, upload=False) self.assertTrue(final_context['success'])
def test_no_smash_dupe(self): """ """ job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237811' sample.title = 'GSM1237811' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']} ds.aggregate_by = 'ALL' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = smasher.smash(job.pk, upload=False) dsid = ds.id ds = Dataset.objects.get(id=dsid) self.assertTrue(ds.success) for column in final_context['original_merged'].columns: self.assertTrue('_x' not in column)
def test_no_smash_all_diff_species(self): """ Smashing together with 'ALL' with different species is a really weird behavior. This test isn't really testing a normal case, just make sure that it's marking the unsmashable files. """ job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() experiment = Experiment() experiment.accession_code = "GSE51084" experiment.save() mus_mus = Organism.get_object_for_name("MUS_MUSCULUS") sample = Sample() sample.accession_code = 'GSM1238108' sample.title = 'GSM1238108' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1238108-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']} ds.aggregate_by = 'ALL' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = smasher.smash(job.pk, upload=False) dsid = ds.id ds = Dataset.objects.get(id=dsid) print(ds.failure_reason) print(final_context['dataset'].failure_reason) self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sample_annotation = SampleAnnotation() sample_annotation.data = {'hi': 'friend'} sample_annotation.sample = sample sample_annotation.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237812' sample.title = 'GSM1237812' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "GSM1237812_S97-PURE.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() computed_file = ComputedFile() computed_file.filename = "GSM1237812_S97-PURE.DAT" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = False computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']} ds.aggregate_by = 'EXPERIMENT' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'STANDARD' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() return pj
def test_bad_overlap(self): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sample_annotation = SampleAnnotation() sample_annotation.data = {'hi': 'friend'} sample_annotation.sample = sample sample_annotation.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "big.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237812' sample.title = 'GSM1237812' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "small.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']} ds.aggregate_by = 'ALL' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'NONE' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # Now, make sure the bad can't zero this out. sample = Sample() sample.accession_code = 'GSM999' sample.title = 'GSM999' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "bad.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']} ds.aggregate_by = 'ALL' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'NONE' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) self.assertEqual(len(final_context['final_frame']), 4)
def test_qn_reference(self): # We don't have a 0.tsv experiment = prepare_experiment(range(1, 201)) job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() dataset = Dataset() dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]} dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False # We don't QN because we're creating the target now dataset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["target_file"])) self.assertEqual(os.path.getsize(final_context["target_file"]), 562) homo_sapiens = Organism.objects.get(taxonomy_id=9606) target = homo_sapiens.qn_target.computedfile_set.latest() self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a") # Create and run a smasher job that will use the QN target we just made. pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() ds = Dataset() ds.data = {"12345": ["1", "2", "3", "4", "5"]} ds.aggregate_by = "SPECIES" ds.scale_by = "STANDARD" ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(final_context["success"]) np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811) np.testing.assert_almost_equal( final_context["original_merged"]["1"][0], -0.5762109) # Make sure that the results were created. We create 200 computed files # and computational results (1 for each sample) plus the one generated # by the QN reference processor. self.assertEqual(ComputedFile.objects.all().count(), 200 + 1) self.assertEqual(ComputationalResult.objects.all().count(), 200 + 1) self.assertEqual(ComputationalResultAnnotation.objects.all().count(), 1)
def test_qn_reference(self): job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") experiment = Experiment() experiment.accession_code = "12345" experiment.save() for code in ['1', '2', '3', '4', '5', '6']: sample = Sample() sample.accession_code = code sample.title = code sample.platform_accession_code = 'A-MEXP-1171' sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS" sample.organism = homo_sapiens sample.technology = "MICROARRAY" sample.is_processed = True sample.save() cr = ComputationalResult() cr.save() file = ComputedFile() file.filename = code + ".tsv" file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv" file.size_in_bytes = int(code) file.result = cr file.is_smashable = True file.save() scfa = SampleComputedFileAssociation() scfa.sample = sample scfa.computed_file = file scfa.save() exsa = ExperimentSampleAssociation() exsa.experiment = experiment exsa.sample = sample exsa.save() dataset = Dataset() dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]} dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False # We don't QN because we're creating the target now dataset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) self.assertTrue(final_context['success']) self.assertTrue(os.path.exists(final_context['target_file'])) self.assertEqual(os.path.getsize(final_context['target_file']), 556) target = utils.get_most_recent_qn_target_for_organism(homo_sapiens) self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea') ### # Smasher with QN ### pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() ds = Dataset() ds.data = {"12345": ["1", "2", "3", "4", "5"]} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(final_context['success']) self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934) self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982) ## # Test via management command ## from django.core.management import call_command from django.test import TestCase from django.utils.six import StringIO out = StringIO() try: call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out) except SystemExit as e: # this is okay! pass stdout = out.getvalue() self.assertTrue('Target file' in stdout) path = stdout.split('\n')[0].split(':')[1].strip() self.assertTrue(os.path.exists(path)) self.assertEqual(path, utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
def test_create_quantpendia(self): job = ProcessorJob() job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value job.save() experiment = Experiment() experiment.accession_code = "GSE51088" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606) sample = Sample() sample.accession_code = "GSM1237818" sample.title = "GSM1237818" sample.organism = homo_sapiens sample.technology = "RNA-SEQ" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.s3_key = "smasher-test-quant.sf" computed_file.s3_bucket = "data-refinery-test-assets" computed_file.filename = "quant.sf" computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf" computed_file.result = result computed_file.is_smashable = True computed_file.size_in_bytes = 123123 computed_file.sha1 = ( "08c7ea90b66b52f7cd9d9a569717a1f5f3874967" # this matches with the downloaded file ) computed_file.save() computed_file = ComputedFile() computed_file.filename = "logquant.tsv" computed_file.is_smashable = True computed_file.size_in_bytes = 123123 computed_file.result = result computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {"GSE51088": ["GSM1237818"]} ds.aggregate_by = "EXPERIMENT" ds.scale_by = "STANDARD" ds.email_address = "*****@*****.**" ds.quant_sf_only = True # Make the dataset include quant.sf files only ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = create_quantpendia(job.id) self.assertTrue( os.path.exists(final_context["output_dir"] + "/GSE51088/GSM1237818_quant.sf")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/README.md")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/LICENSE.TXT")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/aggregated_metadata.json")) self.assertTrue(final_context["metadata"]["quant_sf_only"]) self.assertEqual(final_context["metadata"]["num_samples"], 1) self.assertEqual(final_context["metadata"]["num_experiments"], 1) # test that archive exists quantpendia_file = ComputedFile.objects.filter( is_compendia=True, quant_sf_only=True).latest() self.assertTrue(os.path.exists(quantpendia_file.absolute_file_path))