def handle(self, *args, **options):
    """Build and process one compendia dataset per organism.

    For every organism (or only the one named by ``options["organism"]``,
    upper-cased before lookup), collect all of its experiments into a large
    but normally formatted Dataset that maps each experiment accession code
    to the accession codes of that experiment's samples for the organism.
    The dataset is attached to a new "COMPENDIA" ProcessorJob, which is then
    run through ``create_compendia``. Retrieve the results manually as
    desired.

    The command always ends by calling ``sys.exit(0)``.
    """
    organism_name = options["organism"]
    if organism_name is None:
        all_organisms = Organism.objects.all()
    else:
        all_organisms = [Organism.get_object_for_name(organism_name.upper())]

    for organism in all_organisms:
        experiment_ids = ExperimentOrganismAssociation.objects.filter(
            organism=organism
        ).values('experiment')
        experiments = Experiment.objects.filter(id__in=experiment_ids)

        # Experiment accession code -> sample accession codes (this organism only).
        data = {
            experiment.accession_code: list(
                experiment.samples.filter(organism=organism).values_list(
                    'accession_code', flat=True
                )
            )
            for experiment in experiments
        }

        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        dset = Dataset()
        dset.data = data
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        # The returned context is not needed by this command.
        create_compendia.create_compendia(job.id)

    sys.exit(0)
def handle(self, *args, **options):
    """Run the processor selected by ``options["job_name"]`` on ``options["job_id"]``.

    Exits with status 1 when no job ID is given, when the job name is not a
    member of ``ProcessorPipeline``, or when no processor function is wired
    up for that pipeline. Exits with status 0 once the selected processor
    has returned.

    Each processor module is imported inside its own branch, so only the
    module for the requested pipeline is loaded.
    """
    job_id = options["job_id"]
    if job_id is None:
        logger.error("You must specify a job ID.", job_id=job_id)
        sys.exit(1)

    try:
        job_type = ProcessorPipeline[options["job_name"]]
    except KeyError:
        logger.error(
            "You must specify a valid job name.",
            job_name=options["job_name"],
            job_id=job_id,
        )
        sys.exit(1)

    if job_type is ProcessorPipeline.AFFY_TO_PCL:
        from data_refinery_workers.processors.array_express import affy_to_pcl

        affy_to_pcl(job_id)
    elif (
        job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT
        or job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG
    ):
        from data_refinery_workers.processors.transcriptome_index import (
            build_transcriptome_index,
        )

        # Both index pipelines share one entry point and differ only in length.
        if job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT:
            index_length = "short"
        else:
            index_length = "long"
        build_transcriptome_index(job_id, length=index_length)
    elif job_type is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
        from data_refinery_workers.processors.agilent_twocolor import agilent_twocolor_to_pcl

        agilent_twocolor_to_pcl(job_id)
    elif job_type is ProcessorPipeline.ILLUMINA_TO_PCL:
        from data_refinery_workers.processors.illumina import illumina_to_pcl

        illumina_to_pcl(job_id)
    elif job_type is ProcessorPipeline.SALMON:
        from data_refinery_workers.processors.salmon import salmon

        salmon(job_id)
    elif job_type is ProcessorPipeline.TXIMPORT:
        from data_refinery_workers.processors.tximport import tximport

        tximport(job_id)
    elif job_type is ProcessorPipeline.SMASHER:
        from data_refinery_workers.processors.smasher import smash

        smash(job_id)
    elif job_type is ProcessorPipeline.CREATE_COMPENDIA:
        from data_refinery_workers.processors.create_compendia import create_compendia

        create_compendia(job_id)
    elif job_type is ProcessorPipeline.CREATE_QUANTPENDIA:
        from data_refinery_workers.processors.create_quantpendia import create_quantpendia

        create_quantpendia(job_id)
    elif job_type is ProcessorPipeline.NO_OP:
        from data_refinery_workers.processors.no_op import no_op_processor

        no_op_processor(job_id)
    elif job_type is ProcessorPipeline.JANITOR:
        from data_refinery_workers.processors.janitor import run_janitor

        run_janitor(job_id)
    elif job_type is ProcessorPipeline.QN_REFERENCE:
        from data_refinery_workers.processors import qn_reference

        qn_reference.create_qn_reference(job_id)
    else:
        # The name is a valid ProcessorPipeline member, but no branch above runs it.
        logger.error(
            (
                "A valid job name was specified for job %s with id %d but "
                "no processor function is known to run it."
            ),
            options["job_name"],
            job_id,
        )
        sys.exit(1)

    sys.exit(0)
def test_create_compendia(self):
    """Run the compendia pipeline on a tiny mixed-technology Gallus gallus dataset.

    Two microarray samples and one RNA-seq sample are registered. One of the
    microarray samples points at a computed file path named
    "doesnt_exists.PCL", and the test checks that this sample is reported in
    ``final_context["filtered_samples"]`` under its experiment.
    """
    pcl_dir = "/home/user/data_store/PCL/"

    def _create(model_cls, **fields):
        """Instantiate ``model_cls``, assign ``fields`` as attributes, save, return it."""
        instance = model_cls()
        for attr, value in fields.items():
            setattr(instance, attr, value)
        instance.save()
        return instance

    def _add_sample(experiment, result, organism, technology, accession_code,
                    title, filename, file_path, size_in_bytes):
        """Create a sample with its result, experiment and computed-file links."""
        sample = _create(
            Sample,
            accession_code=accession_code,
            title=title,
            organism=organism,
            technology=technology,
        )
        _create(SampleResultAssociation, sample=sample, result=result)
        _create(ExperimentSampleAssociation, experiment=experiment, sample=sample)
        computed_file = _create(
            ComputedFile,
            filename=filename,
            absolute_file_path=file_path,
            result=result,
            size_in_bytes=size_in_bytes,
            is_smashable=True,
        )
        _create(SampleComputedFileAssociation, sample=sample, computed_file=computed_file)
        return sample

    job = _create(ProcessorJob, pipeline_applied=ProcessorPipeline.CREATE_COMPENDIA.value)

    # MICROARRAY TECH
    experiment = _create(Experiment, accession_code="GSE1487313")
    result = _create(ComputationalResult)
    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    _add_sample(
        experiment, result, gallus_gallus, "MICROARRAY",
        accession_code="GSM1487313",
        title="GSM1487313",
        filename="GSM1487313_liver.PCL",
        file_path=pcl_dir + "GSM1487313_liver.PCL",
        size_in_bytes=123,
    )

    # Missing sample that will be filtered
    _add_sample(
        experiment, result, gallus_gallus, "MICROARRAY",
        accession_code="GSM1487222",
        title="this sample will be filtered",
        filename="GSM1487222_empty.PCL",
        file_path="/home/user/data_store/PCL/doesnt_exists.PCL",
        size_in_bytes=123,
    )

    # RNASEQ TECH
    experiment2 = _create(Experiment, accession_code="SRS332914")
    result2 = _create(ComputationalResult)
    _add_sample(
        experiment2, result2, gallus_gallus, "RNA-SEQ",
        accession_code="SRS332914",
        title="SRS332914",
        filename="SRP149598_gene_lengthScaledTPM.tsv",
        file_path=pcl_dir + "SRP149598_gene_lengthScaledTPM.tsv",
        size_in_bytes=234,
    )

    # NOTE(review): the dataset key "SRX332914" does not match the accession
    # code given to experiment2 ("SRS332914") - confirm which one is intended.
    dset = _create(
        Dataset,
        data={
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRX332914": ["SRS332914"],
        },
        scale_by="NONE",
        aggregate_by="SPECIES",
        svd_algorithm="ARPACK",
        quantile_normalize=False,
    )
    _create(ProcessorJobDatasetAssociation, processor_job=job, dataset=dset)

    final_context = create_compendia.create_compendia(job.id)

    # NOTE(review): `job` is not reloaded after the pipeline runs, so this
    # presumably inspects the in-memory value set before the run - confirm
    # whether a reload from the database is needed for a meaningful check.
    self.assertFalse(job.success)

    # check that sample with no computed file was skipped
    self.assertIn("GSM1487222", final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE1487313",
    )
def test_create_compendia_danio(self):
    """Build a Danio rerio compendium from the microarray and RNA-seq fixtures.

    Every file in the two TEST fixture directories (except the
    "microarray.txt" / "rnaseq.txt" entries) becomes a sample with a
    computed file. One extra RNA-seq sample is registered without any
    computed file. The test then checks the resulting compendium record and
    that the extra sample appears in ``final_context["filtered_samples"]``.
    """

    def _create(model_cls, **fields):
        """Instantiate ``model_cls``, assign ``fields`` as attributes, save, return it."""
        instance = model_cls()
        for attr, value in fields.items():
            setattr(instance, attr, value)
        instance.save()
        return instance

    def _link_sample(accession_code, title, organism, technology, experiment, result):
        """Create a sample and associate it with ``result`` and ``experiment``."""
        sample = _create(
            Sample,
            accession_code=accession_code,
            title=title,
            organism=organism,
            technology=technology,
        )
        _create(SampleResultAssociation, sample=sample, result=result)
        _create(ExperimentSampleAssociation, experiment=experiment, sample=sample)
        return sample

    def _load_directory(directory, skipped_marker, organism, technology, experiment, result):
        """Register one sample + computed file per directory entry; return their names."""
        loaded = []
        for filename in os.listdir(directory):
            if skipped_marker in filename:
                continue
            sample = _link_sample(filename, filename, organism, technology, experiment, result)
            computed_file = _create(
                ComputedFile,
                filename=filename,
                absolute_file_path=directory + filename,
                result=result,
                size_in_bytes=123,
                is_smashable=True,
            )
            _create(SampleComputedFileAssociation, sample=sample, computed_file=computed_file)
            loaded.append(filename)
        return loaded

    job = _create(ProcessorJob, pipeline_applied=ProcessorPipeline.CREATE_COMPENDIA.value)

    # MICROARRAY TECH
    experiment = _create(Experiment, accession_code="GSE1234")
    result = _create(ComputationalResult)

    _create(
        ComputedFile,
        filename="danio_target.tsv",
        absolute_file_path="/home/user/data_store/QN/danio_target.tsv",
        is_qn_target=True,
        size_in_bytes="12345",
        sha1="aabbccddeeff",
        result=result,
    )

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    _create(
        ComputationalResultAnnotation,
        data={"organism_id": danio_rerio.id, "is_qn": True},
        result=result,
    )

    result = _create(ComputationalResult)
    micros = _load_directory(
        "/home/user/data_store/raw/TEST/MICROARRAY/",
        "microarray.txt",
        danio_rerio,
        "MICROARRAY",
        experiment,
        result,
    )

    experiment = _create(Experiment, accession_code="GSE5678")
    result = _create(ComputationalResult)
    # NOTE(review): technology is spelled "RNASEQ" here, while other tests in
    # this file use "RNA-SEQ" - confirm which spelling the pipeline expects.
    rnas = _load_directory(
        "/home/user/data_store/raw/TEST/RNASEQ/",
        "rnaseq.txt",
        danio_rerio,
        "RNASEQ",
        experiment,
        result,
    )

    # Missing sample that will be filtered
    missing_sample = _link_sample(
        "GSM1487222",
        "this sample will be filtered",
        danio_rerio,
        "RNASEQ",
        experiment,
        result,
    )
    rnas.append(missing_sample.accession_code)

    dset = _create(
        Dataset,
        data={"GSE1234": micros, "GSE5678": rnas},
        scale_by="NONE",
        aggregate_by="SPECIES",
        svd_algorithm="ARPACK",
        quantile_normalize=False,
    )
    _create(ProcessorJobDatasetAssociation, processor_job=job, dataset=dset)

    final_context = create_compendia.create_compendia(job.id)
    compendium_result = final_context["compendium_result"]

    # Verify result
    self.assertEqual(compendium_result.result.computedfile_set.count(), 1)
    for computed_file in compendium_result.result.computedfile_set.all():
        self.assertTrue(os.path.exists(computed_file.absolute_file_path))

    # test compendium_result
    self.assertEqual(compendium_result.svd_algorithm, "ARPACK")
    self.assertEqual(compendium_result.primary_organism.name, final_context["organism_name"])
    self.assertEqual(compendium_result.primary_organism.name, "DANIO_RERIO")
    self.assertEqual(compendium_result.organisms.count(), 1)

    # check that sample with no computed file was skipped
    self.assertIn("GSM1487222", final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE5678",
    )
def test_create_compendia(self):
    """Smoke-test the compendia pipeline on one microarray and one RNA-seq sample.

    The test makes no assertions about the outcome; it only requires that
    ``create_compendia`` returns without raising.
    """
    pcl_dir = "/home/user/data_store/PCL/"

    def _create(model_cls, **fields):
        """Instantiate ``model_cls``, assign ``fields`` as attributes, save, return it."""
        instance = model_cls()
        for attr, value in fields.items():
            setattr(instance, attr, value)
        instance.save()
        return instance

    def _add_sample(experiment, result, organism, technology, accession_code,
                    filename, size_in_bytes):
        """Create a sample (title == accession code) plus all of its associations."""
        sample = _create(
            Sample,
            accession_code=accession_code,
            title=accession_code,
            organism=organism,
            technology=technology,
        )
        _create(SampleResultAssociation, sample=sample, result=result)
        _create(ExperimentSampleAssociation, experiment=experiment, sample=sample)
        computed_file = _create(
            ComputedFile,
            filename=filename,
            absolute_file_path=pcl_dir + filename,
            result=result,
            size_in_bytes=size_in_bytes,
            is_smashable=True,
        )
        _create(SampleComputedFileAssociation, sample=sample, computed_file=computed_file)
        return sample

    job = _create(ProcessorJob, pipeline_applied="COMPENDIA")

    # MICROARRAY TECH
    experiment = _create(Experiment, accession_code="GSE1487313")
    result = _create(ComputationalResult)
    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")
    _add_sample(
        experiment, result, gallus_gallus, "MICROARRAY",
        accession_code='GSM1487313',
        filename="GSM1487313_liver.PCL",
        size_in_bytes=123,
    )

    # RNASEQ TECH
    experiment2 = _create(Experiment, accession_code="SRS332914")
    result2 = _create(ComputationalResult)
    _add_sample(
        experiment2, result2, gallus_gallus, "RNA-SEQ",
        accession_code='SRS332914',
        filename="SRP149598_gene_lengthScaledTPM.tsv",
        size_in_bytes=234,
    )

    # NOTE(review): the dataset key 'SRX332914' does not match the accession
    # code given to experiment2 ("SRS332914") - confirm which one is intended.
    dset = _create(
        Dataset,
        data={'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']},
        scale_by='NONE',
        aggregate_by='SPECIES',
        quantile_normalize=False,
    )
    _create(ProcessorJobDatasetAssociation, processor_job=job, dataset=dset)

    # Only the absence of an exception is checked; the context is not inspected.
    create_compendia.create_compendia(job.id)
def test_create_compendia_danio(self):
    """Run the compendia pipeline on the Danio rerio microarray and RNA-seq fixtures.

    Every file in the two TEST fixture directories (except the
    "microarray.txt" / "rnaseq.txt" entries) becomes a sample with a computed
    file. After the run, the test expects three entries in
    ``final_context['computed_files']``, each pointing at an existing path.
    """

    def _create(model_cls, **fields):
        """Instantiate ``model_cls``, assign ``fields`` as attributes, save, return it."""
        instance = model_cls()
        for attr, value in fields.items():
            setattr(instance, attr, value)
        instance.save()
        return instance

    def _load_directory(directory, skipped_marker, organism, technology, experiment, result):
        """Register one sample + computed file per directory entry; return their names."""
        loaded = []
        for filename in os.listdir(directory):
            if skipped_marker in filename:
                continue
            sample = _create(
                Sample,
                accession_code=filename,
                title=filename,
                organism=organism,
                technology=technology,
            )
            _create(SampleResultAssociation, sample=sample, result=result)
            _create(ExperimentSampleAssociation, experiment=experiment, sample=sample)
            computed_file = _create(
                ComputedFile,
                filename=filename,
                absolute_file_path=directory + filename,
                result=result,
                size_in_bytes=123,
                is_smashable=True,
            )
            _create(SampleComputedFileAssociation, sample=sample, computed_file=computed_file)
            loaded.append(filename)
        return loaded

    job = _create(ProcessorJob, pipeline_applied="COMPENDIA")

    # MICROARRAY TECH
    experiment = _create(Experiment, accession_code="GSE1234")
    result = _create(ComputationalResult)
    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")
    micros = _load_directory(
        '/home/user/data_store/raw/TEST/MICROARRAY/',
        'microarray.txt',
        danio_rerio,
        "MICROARRAY",
        experiment,
        result,
    )

    # RNA-seq fixtures live under a second experiment with their own result.
    experiment = _create(Experiment, accession_code="GSE5678")
    result = _create(ComputationalResult)
    rnas = _load_directory(
        '/home/user/data_store/raw/TEST/RNASEQ/',
        'rnaseq.txt',
        danio_rerio,
        "RNASEQ",
        experiment,
        result,
    )

    # A separate result carries the QN target file and its annotation.
    result = _create(ComputationalResult)
    _create(
        ComputedFile,
        filename="danio_target.tsv",
        absolute_file_path='/home/user/data_store/QN/danio_target.tsv',
        is_qn_target=True,
        size_in_bytes="12345",
        sha1="aabbccddeeff",
        result=result,
    )
    _create(
        ComputationalResultAnnotation,
        data={'organism_id': danio_rerio.id, 'is_qn': True},
        result=result,
    )

    dset = _create(
        Dataset,
        data={'GSE1234': micros, 'GSE5678': rnas},
        scale_by='NONE',
        aggregate_by='SPECIES',
        quantile_normalize=False,
    )
    _create(ProcessorJobDatasetAssociation, processor_job=job, dataset=dset)

    final_context = create_compendia.create_compendia(job.id)

    # Verify result
    produced_files = final_context['computed_files']
    self.assertEqual(len(produced_files), 3)
    for produced in produced_files:
        self.assertTrue(os.path.exists(produced.absolute_file_path))
def test_create_compendia(self):
    """Expect the compendia job to fail when too few samples survive filtering.

    Two microarray samples and one RNA-seq sample are registered for Gallus
    gallus. The test expects the job to fail with a message containing
    "k must be between 1 and min(A.shape)" and expects "GSM1487222" to be
    listed in ``final_context["filtered_samples"]`` under its experiment.
    """
    DATA_DIR = "/home/user/data_store/PCL/"

    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    # MICROARRAY TECH
    # get_or_create already persists the experiment with this accession code,
    # so no further assignment or save is needed.
    experiment, _ = Experiment.objects.get_or_create(accession_code="GSE1487313")

    create_sample_for_experiment(
        {
            "organism": gallus_gallus,
            "accession_code": "GSM1487313",
            "technology": "MICROARRAY",
            "filename": "GSM1487313_liver.PCL",
            "data_dir": DATA_DIR,
        },
        experiment,
    )

    # Missing sample that will be filtered
    create_sample_for_experiment(
        {
            "organism": gallus_gallus,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "MICROARRAY",
            "filename": "GSM1487222_empty.PCL",
            "data_dir": DATA_DIR,
        },
        experiment,
    )

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRP149598"
    experiment2.save()

    # Attach the RNA-seq sample to its own experiment so the fixture agrees
    # with the dataset below, which lists SRR7250867 under "SRP149598".
    create_sample_for_experiment(
        {
            "organism": gallus_gallus,
            "accession_code": "SRR7250867",
            "technology": "RNA-SEQ",
            "filename": "SRP149598_gene_lengthScaledTPM.tsv",
            "data_dir": DATA_DIR,
        },
        experiment2,
    )

    dset = Dataset()
    dset.data = {
        "GSE1487313": ["GSM1487313", "GSM1487222"],
        "SRP149598": ["SRR7250867"],
    }
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Because one of the samples is filtered out, there will be too few
    # remaining samples to smash together, so we expect this job to fail.
    self.assertFailed(job, "k must be between 1 and min(A.shape)")

    # check that sample with no computed file was skipped
    self.assertIn("GSM1487222", final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE1487313",
    )
def test_create_compendia_microarray_only(self):
    """Make sure a compendium can be created from microarray samples alone.

    Every file in the MICROARRAY fixture directory (except the
    "microarray.txt" entry) becomes a Danio rerio sample. The test checks
    the compendium record, the ``aggregated_metadata.json`` stored inside
    the produced archive, and the KS statistics placed in the context.
    """
    microarray_dir = "/home/user/data_store/raw/TEST/MICROARRAY/"

    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {"organism_id": danio_rerio.id, "is_qn": True}
    cra.result = result
    cra.save()

    # Extra result row kept from the original fixture; nothing below
    # references it directly.
    result = ComputationalResult()
    result.save()

    micros = []
    for filename in os.listdir(microarray_dir):
        if "microarray.txt" in filename:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": filename,
                "technology": "MICROARRAY",
                "filename": filename,
                "data_dir": microarray_dir,
            },
            experiment,
        )
        micros.append(filename)

    dset = Dataset()
    dset.data = {"GSE1234": micros}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    self.assertSucceeded(job)

    compendium_result = final_context["compendium_result"]

    # Verify result
    self.assertEqual(compendium_result.result.computedfile_set.count(), 1)
    for computed_file in compendium_result.result.computedfile_set.all():
        self.assertTrue(os.path.exists(computed_file.absolute_file_path))

    # test compendium_result
    self.assertEqual(compendium_result.svd_algorithm, "ARPACK")
    self.assertEqual(
        compendium_result.primary_organism.name,
        final_context["organism_name"],
    )
    self.assertEqual(compendium_result.primary_organism.name, "DANIO_RERIO")
    self.assertEqual(compendium_result.organisms.count(), 1)

    # Open the archive in a context manager so its file handle is always
    # released, even when an assertion or the JSON parsing fails.
    archive_path = compendium_result.result.computedfile_set.first().absolute_file_path
    with zipfile.ZipFile(archive_path) as zf:
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

    self.assertFalse(metadata.get("quant_sf_only"))
    # 420 microarray
    self.assertEqual(metadata.get("num_samples"), 420)
    self.assertEqual(metadata.get("num_experiments"), 1)

    # Make sure the data were quantile normalized
    self.assertTrue(metadata.get("quantile_normalized"))

    self.assertIn("ks_statistic", final_context)
    self.assertIn("ks_pvalue", final_context)
    self.assertEqual(final_context["ks_pvalue"], 1.0)
def test_create_compendia_danio(self):
    """Build a Danio rerio compendium from microarray and RNA-seq fixtures.

    Every file in the two TEST fixture directories (except the
    "microarray.txt" / "rnaseq.txt" entries) becomes a sample, and one extra
    RNA-seq sample is registered with ``"filename": None``. The test expects
    the job to succeed, ten samples to be reported as filtered (the extra
    sample plus nine with "less than 50% present values"), and the
    ``aggregated_metadata.json`` inside the produced archive to describe 830
    samples across 2 experiments.
    """
    microarray_dir = "/home/user/data_store/raw/TEST/MICROARRAY/"
    rnaseq_dir = "/home/user/data_store/raw/TEST/RNASEQ/"

    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {"organism_id": danio_rerio.id, "is_qn": True}
    cra.result = result
    cra.save()

    # Extra result rows (here and before the RNA-seq loop) are kept from the
    # original fixture; nothing below references them directly.
    result = ComputationalResult()
    result.save()

    micros = []
    for filename in os.listdir(microarray_dir):
        if "microarray.txt" in filename:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": filename,
                "technology": "MICROARRAY",
                "filename": filename,
                "data_dir": microarray_dir,
            },
            experiment,
        )
        micros.append(filename)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for filename in os.listdir(rnaseq_dir):
        if "rnaseq.txt" in filename:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": filename,
                "technology": "RNA-SEQ",
                "filename": filename,
                "data_dir": rnaseq_dir,
            },
            experiment,
        )
        rnas.append(filename)

    # Missing sample that will be filtered
    sample = create_sample_for_experiment(
        {
            "organism": danio_rerio,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "RNA-SEQ",
            "filename": None,
        },
        experiment,
    )
    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    self.assertSucceeded(job)

    compendium_result = final_context["compendium_result"]
    filtered_samples = final_context["filtered_samples"]

    # Verify result
    self.assertEqual(compendium_result.result.computedfile_set.count(), 1)
    for computed_file in compendium_result.result.computedfile_set.all():
        self.assertTrue(os.path.exists(computed_file.absolute_file_path))

    # test compendium_result
    self.assertEqual(compendium_result.svd_algorithm, "ARPACK")
    self.assertEqual(
        compendium_result.primary_organism.name,
        final_context["organism_name"],
    )
    self.assertEqual(compendium_result.primary_organism.name, "DANIO_RERIO")
    self.assertEqual(compendium_result.organisms.count(), 1)

    self.assertEqual(len(filtered_samples), 10)

    # check that sample with no computed file was skipped
    self.assertIn("GSM1487222", filtered_samples)
    self.assertEqual(
        filtered_samples["GSM1487222"]["experiment_accession_code"],
        "GSE5678",
    )
    self.assertIn(
        "This sample did not have a processed file",
        filtered_samples["GSM1487222"]["reason"],
    )

    # check that the 9 files with lots of missing measurements were filtered
    sparse_sample_count = sum(
        1
        for entry in filtered_samples.values()
        if "less than 50% present values" in entry["reason"]
    )
    self.assertEqual(sparse_sample_count, 9)

    # Open the archive in a context manager so its file handle is always
    # released, even when an assertion or the JSON parsing fails.
    archive_path = compendium_result.result.computedfile_set.first().absolute_file_path
    with zipfile.ZipFile(archive_path) as zf:
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

    self.assertFalse(metadata.get("quant_sf_only"))
    self.assertEqual(metadata.get("compendium_version"), 1)

    # 420 microarray + 420 RNA seq
    # -1 that is filtered for a missing file
    # -9 that are filtered for having less than 50% present values
    self.assertEqual(metadata.get("num_samples"), 830)
    self.assertEqual(metadata.get("num_experiments"), 2)

    # Make sure the data were quantile normalized
    self.assertTrue(metadata.get("quantile_normalized"))

    self.assertIn("ks_statistic", final_context)
    self.assertIn("ks_pvalue", final_context)
    self.assertEqual(final_context["ks_pvalue"], 1.0)