def create_qn_target(organism, platform, create_results=True):
    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")
    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    return qn_reference.create_qn_reference(job.pk, create_results=create_results)
def test_notify(self):
    ds = Dataset()
    ds.data = {'GSM1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    job_context = {}
    job_context['job'] = pj
    job_context['dataset'] = ds
    job_context['upload'] = True
    job_context['result_url'] = 'https://s3.amazonaws.com/data-refinery-test-assets/all_the_things.jpg'

    final_context = smasher._notify(job_context)

    self.assertTrue(final_context.get('success', True))
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
def create_job_for_organism(organisms: List[Organism], svd_algorithm="ARPACK"):
    """Returns a compendia job for the provided organisms.

    Fetches all of the experiments and compiles a large but normally
    formatted Dataset.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    dataset = Dataset()
    dataset.data = get_dataset(organisms)
    dataset.scale_by = "NONE"
    dataset.aggregate_by = "SPECIES"
    dataset.quantile_normalize = True
    dataset.quant_sf_only = False
    dataset.svd_algorithm = svd_algorithm
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
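# NOTE (sketch, not part of the original code): get_dataset() is referenced
# above but not defined in this section. Its return shape can be inferred from
# the compendia management command later in this section, which builds a dict
# mapping experiment accession codes to lists of sample accession codes for
# each organism. A minimal, hypothetical version might look like:
def get_dataset_sketch(organisms):
    data = {}
    for organism in organisms:
        # Find every experiment associated with this organism.
        experiments = Experiment.objects.filter(
            id__in=ExperimentOrganismAssociation.objects.filter(
                organism=organism
            ).values("experiment")
        )
        for experiment in experiments:
            # Map the experiment accession to its samples for this organism.
            data[experiment.accession_code] = list(
                experiment.samples.filter(organism=organism).values_list(
                    "accession_code", flat=True
                )
            )
    return data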
def test_fail(self):
    """ Test our ability to fail """

    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = 'XXX'
    sample.title = 'XXX'
    sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "NOT_REAL.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['XXX']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()
    dsid = ds.id

    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)
    ds = Dataset.objects.get(id=dsid)
    print(ds.failure_reason)
    print(final_context['dataset'].failure_reason)

    self.assertNotEqual(final_context['unsmashable_files'], [])
def handle(self, *args, **options):
    """For every (or a supplied) organism, fetch all of the experiments and
    compile a large but normally formatted Dataset.

    Send all of them to the Smasher. Smash them. Retrieve manually as desired.
    """
    dataset_ids = []

    if options["organism"] is None:
        all_organisms = Organism.objects.all()
    else:
        all_organisms = [Organism.get_object_for_name(options["organism"].upper())]

    for organism in all_organisms:
        data = {}
        experiments = Experiment.objects.filter(
            id__in=(ExperimentOrganismAssociation.objects.filter(
                organism=organism)).values('experiment'))

        for experiment in experiments:
            data[experiment.accession_code] = list(
                experiment.samples.filter(organism=organism).values_list(
                    'accession_code', flat=True))

        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        dset = Dataset()
        dset.data = data
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

    sys.exit(0)
def dispatch_job(self, serializer, obj):
    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "SMASHER"
    processor_job.ram_amount = 4096
    processor_job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = processor_job
    pjda.dataset = obj
    pjda.save()

    job_sent = False
    try:
        # Hidden method of non-dispatching for testing purposes.
        if not self.request.data.get("no_send_job", False):
            job_sent = send_job(ProcessorPipeline.SMASHER, processor_job)
        else:
            # We didn't actually send it, but we also didn't want to.
            job_sent = True
    except Exception as e:
        # Just log whatever exception happens, because the foreman will requeue the job anyway.
        logger.error(e)

    if not job_sent:
        raise APIException(
            "Unable to queue download job. Something has gone"
            " wrong and we have been notified about it."
        )

    serializer.validated_data["is_processing"] = True
    obj = serializer.save()

    # Create a new dataset annotation with the information of this request.
    annotation = DatasetAnnotation()
    annotation.dataset = obj
    annotation.data = {
        "start": True,
        "ip": get_client_ip(self.request),
        "user_agent": self.request.META.get("HTTP_USER_AGENT", None),
    }
    annotation.save()
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    return job
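# NOTE (hypothetical refactoring sketch, not part of the original code): every
# snippet in this section repeats the same steps to link a Dataset to a
# ProcessorJob. A helper like the one below -- the name and signature are
# illustrative only -- would capture that shared boilerplate.
def associate_dataset_with_job(dataset, processor_job):
    """Link an already-saved Dataset to an already-saved ProcessorJob."""
    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = processor_job
    pjda.dataset = dataset
    pjda.save()
    return pjda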
def test_create_compendia_danio(self): job = ProcessorJob() job.pipeline_applied = "COMPENDIA" job.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1234" experiment.save() result = ComputationalResult() result.save() danio_rerio = Organism.get_object_for_name("DANIO_RERIO") micros = [] for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'): if 'microarray.txt' in file: continue sample = Sample() sample.accession_code = file sample.title = file sample.organism = danio_rerio sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = file computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() micros.append(file) experiment = Experiment() experiment.accession_code = "GSE5678" experiment.save() result = ComputationalResult() result.save() rnas = [] for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'): if 'rnaseq.txt' in file: continue sample = Sample() sample.accession_code = file sample.title = file sample.organism = danio_rerio sample.technology = "RNASEQ" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = file computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() rnas.append(file) result = ComputationalResult() result.save() qn_target = ComputedFile() qn_target.filename = "danio_target.tsv" qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv' qn_target.is_qn_target = True qn_target.size_in_bytes = "12345" qn_target.sha1 = "aabbccddeeff" qn_target.result = result qn_target.save() cra = ComputationalResultAnnotation() cra.data = {} cra.data['organism_id'] = danio_rerio.id cra.data['is_qn'] = True cra.result = result cra.save() dset = Dataset() dset.data = {'GSE1234': micros, 'GSE5678': rnas} dset.scale_by = 'NONE' dset.aggregate_by = 'SPECIES' dset.quantile_normalize = False dset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dset pjda.save() final_context = create_compendia.create_compendia(job.id) # Verify result self.assertEqual(len(final_context['computed_files']), 3) for file in final_context['computed_files']: self.assertTrue(os.path.exists(file.absolute_file_path))
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sample_annotation = SampleAnnotation() sample_annotation.data = {'hi': 'friend'} sample_annotation.sample = sample sample_annotation.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237812' sample.title = 'GSM1237812' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "GSM1237812_S97-PURE.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() computed_file = ComputedFile() computed_file.filename = "GSM1237812_S97-PURE.DAT" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = False computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']} ds.aggregate_by = 'EXPERIMENT' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'STANDARD' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() return pj
def test_bad_overlap(self): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sample_annotation = SampleAnnotation() sample_annotation.data = {'hi': 'friend'} sample_annotation.sample = sample sample_annotation.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "big.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237812' sample.title = 'GSM1237812' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "small.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']} ds.aggregate_by = 'ALL' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'NONE' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # Now, make sure the bad can't zero this out. 
sample = Sample() sample.accession_code = 'GSM999' sample.title = 'GSM999' sample.organism = homo_sapiens sample.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() computed_file = ComputedFile() computed_file.filename = "bad.PCL" computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']} ds.aggregate_by = 'ALL' # [ALL or SPECIES or EXPERIMENT] ds.scale_by = 'NONE' # [NONE or MINMAX or STANDARD or ROBUST] ds.email_address = "*****@*****.**" #ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) self.assertEqual(len(final_context['final_frame']), 4)
def test_create_compendia(self): job = ProcessorJob() job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value job.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1487313" experiment.save() result = ComputationalResult() result.save() gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001) sample = Sample() sample.accession_code = "GSM1487313" sample.title = "GSM1487313" sample.organism = gallus_gallus sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1487313_liver.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() # Missing sample that will be filtered sample = Sample() sample.accession_code = "GSM1487222" sample.title = "this sample will be filtered" sample.organism = gallus_gallus sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1487222_empty.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL" computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() # RNASEQ TECH experiment2 = Experiment() experiment2.accession_code = "SRS332914" experiment2.save() result2 = ComputationalResult() result2.save() sample2 = Sample() sample2.accession_code = "SRS332914" sample2.title = "SRS332914" sample2.organism = gallus_gallus sample2.technology = "RNA-SEQ" sample2.save() sra2 = SampleResultAssociation() sra2.sample = sample2 sra2.result = result2 sra2.save() esa2 = ExperimentSampleAssociation() esa2.experiment = experiment2 esa2.sample = sample2 esa2.save() computed_file2 = ComputedFile() computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv" computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename computed_file2.result = result2 computed_file2.size_in_bytes = 234 computed_file2.is_smashable = True computed_file2.save() assoc2 = SampleComputedFileAssociation() assoc2.sample = sample2 assoc2.computed_file = computed_file2 assoc2.save() dset = Dataset() dset.data = { "GSE1487313": ["GSM1487313", "GSM1487222"], "SRX332914": ["SRS332914"] } dset.scale_by = "NONE" dset.aggregate_by = "SPECIES" dset.svd_algorithm = "ARPACK" dset.quantile_normalize = False dset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dset pjda.save() final_context = create_compendia.create_compendia(job.id) self.assertFalse(job.success) # check that sample with no computed file was skipped self.assertTrue("GSM1487222" in final_context["filtered_samples"]) self.assertEqual( final_context["filtered_samples"]["GSM1487222"] ["experiment_accession_code"], "GSE1487313", )
def dispatch_qn_job_if_eligible(organism: Organism) -> None:
    """Checks if the organism is eligible for a QN job and if so dispatches it.

    An organism is eligible for a QN job if it has more than MIN samples on a
    single platform.
    """
    samples = Sample.processed_objects.filter(
        organism=organism,
        has_raw=True,
        technology="MICROARRAY",
        is_processed=True,
        platform_name__contains="Affymetrix",
    )

    if samples.count() < MIN:
        logger.info(
            "Total processed samples don't meet minimum threshold",
            organism=organism,
            count=samples.count(),
            min=MIN,
        )
        return

    platform_counts = (
        samples.values("platform_accession_code")
        .annotate(dcount=Count("platform_accession_code"))
        .order_by("-dcount")
    )
    biggest_platform = platform_counts[0]["platform_accession_code"]

    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=biggest_platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")

    if sample_codes_results.count() < MIN:
        logger.info(
            "Number of processed samples for largest platform didn't meet threshold.",
            organism=organism,
            platform_accession_code=biggest_platform,
            count=sample_codes_results.count(),
            min=MIN,
        )
        return

    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + biggest_platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    logger.info("Sending QN_REFERENCE for Organism", job_id=str(job.pk), organism=str(organism))
    send_job(ProcessorPipeline.QN_REFERENCE, job)

    return job
def test_dualtech_smash(self): """ """ pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1487313" experiment.save() result = ComputationalResult() result.save() gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS") sample = Sample() sample.accession_code = 'GSM1487313' sample.title = 'GSM1487313' sample.organism = gallus_gallus sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1487313_liver.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() # RNASEQ TECH experiment2 = Experiment() experiment2.accession_code = "SRS332914" experiment2.save() result2 = ComputationalResult() result2.save() sample2 = Sample() sample2.accession_code = 'SRS332914' sample2.title = 'SRS332914' sample2.organism = gallus_gallus sample2.technology = "RNA-SEQ" sample2.save() sra2 = SampleResultAssociation() sra2.sample = sample2 sra2.result = result2 sra2.save() esa2 = ExperimentSampleAssociation() esa2.experiment = experiment2 esa2.sample = sample2 esa2.save() computed_file2 = ComputedFile() computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv" computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename computed_file2.result = result2 computed_file2.size_in_bytes = 234 computed_file2.is_smashable = True computed_file2.save() assoc2 = SampleComputedFileAssociation() assoc2.sample = sample2 assoc2.computed_file = computed_file2 assoc2.save() # CROSS-SMASH BY SPECIES ds = Dataset() ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() self.assertTrue(ds.is_cross_technology()) final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) os.remove(final_context['output_file']) self.assertEqual(len(final_context['final_frame'].columns), 2) # THEN BY EXPERIMENT ds.aggregate_by = 'EXPERIMENT' ds.save() dsid = ds.id ds = Dataset.objects.get(id=dsid) pj.start_time = None pj.end_time = None pj.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) os.remove(final_context['output_file']) self.assertEqual(len(final_context['final_frame'].columns), 1) # THEN BY ALL ds.aggregate_by = 'ALL' ds.save() dsid = ds.id ds = Dataset.objects.get(id=dsid) pj.start_time = None pj.end_time = None pj.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(os.path.exists(final_context['output_file'])) self.assertEqual(len(final_context['final_frame'].columns), 2)
def test_create_compendia(self): DATA_DIR = "/home/user/data_store/PCL/" job = ProcessorJob() job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value job.save() gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001) # MICROARRAY TECH (experiment, _) = Experiment.objects.get_or_create(accession_code="GSE1487313") experiment.accession_code = "GSE1487313" experiment.save() create_sample_for_experiment( { "organism": gallus_gallus, "accession_code": "GSM1487313", "technology": "MICROARRAY", "filename": "GSM1487313_liver.PCL", "data_dir": DATA_DIR, }, experiment, ) # Missing sample that will be filtered create_sample_for_experiment( { "organism": gallus_gallus, "accession_code": "GSM1487222", "title": "this sample will be filtered", "technology": "MICROARRAY", "filename": "GSM1487222_empty.PCL", "data_dir": DATA_DIR, }, experiment, ) # RNASEQ TECH experiment2 = Experiment() experiment2.accession_code = "SRP149598" experiment2.save() create_sample_for_experiment( { "organism": gallus_gallus, "accession_code": "SRR7250867", "technology": "RNA-SEQ", "filename": "SRP149598_gene_lengthScaledTPM.tsv", "data_dir": DATA_DIR, }, experiment, ) dset = Dataset() dset.data = { "GSE1487313": ["GSM1487313", "GSM1487222"], "SRP149598": ["SRR7250867"], } dset.scale_by = "NONE" dset.aggregate_by = "SPECIES" dset.svd_algorithm = "ARPACK" dset.quantile_normalize = True dset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dset pjda.save() final_context = create_compendia.create_compendia(job.id) # Because one of the samples is filtered out, there will be too few # remaining samples to smash together, so we expect this job to fail. self.assertFailed(job, "k must be between 1 and min(A.shape)") # check that sample with no computed file was skipped self.assertTrue("GSM1487222" in final_context["filtered_samples"]) self.assertEqual( final_context["filtered_samples"]["GSM1487222"] ["experiment_accession_code"], "GSE1487313", )
def test_no_smash_all_diff_species(self): """ Smashing together with 'ALL' with different species is a really weird behavior. This test isn't really testing a normal case, just make sure that it's marking the unsmashable files. """ job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() experiment = Experiment() experiment.accession_code = "GSE51084" experiment.save() mus_mus = Organism.get_object_for_name("MUS_MUSCULUS") sample = Sample() sample.accession_code = 'GSM1238108' sample.title = 'GSM1238108' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1238108-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']} ds.aggregate_by = 'ALL' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = smasher.smash(job.pk, upload=False) dsid = ds.id ds = Dataset.objects.get(id=dsid) print(ds.failure_reason) print(final_context['dataset'].failure_reason) self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
def test_no_smash_dupe_two(self): """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file.""" job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "SRP051449" experiment.save() result = ComputationalResult() result.save() danio_rerio = Organism.get_object_for_name("DANIO_RERIO") sample = Sample() sample.accession_code = 'SRR1731761' sample.title = 'Danio rerio' sample.organism = danio_rerio sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'SRR1731762' sample.title = 'Danio rerio' sample.organism = danio_rerio sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']} ds.aggregate_by = 'SPECIES' ds.scale_by = 'NONE' ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() cr = ComputationalResult() cr.save() computed_file = ComputedFile() computed_file.filename = "danio_target.tsv" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = cr computed_file.size_in_bytes = 123 computed_file.is_smashable = False computed_file.save() cra = ComputationalResultAnnotation() cra.data = {'organism_id': danio_rerio.id, 'is_qn': True} cra.result = cr cra.save() final_context = smasher.smash(job.pk, upload=False) self.assertTrue(final_context['success'])
def test_create_quantpendia(self): job = ProcessorJob() job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value job.save() experiment = Experiment() experiment.accession_code = "GSE51088" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606) sample = Sample() sample.accession_code = "GSM1237818" sample.title = "GSM1237818" sample.organism = homo_sapiens sample.technology = "RNA-SEQ" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.s3_key = "smasher-test-quant.sf" computed_file.s3_bucket = "data-refinery-test-assets" computed_file.filename = "quant.sf" computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf" computed_file.result = result computed_file.is_smashable = True computed_file.size_in_bytes = 123123 computed_file.sha1 = ( "08c7ea90b66b52f7cd9d9a569717a1f5f3874967" # this matches with the downloaded file ) computed_file.save() computed_file = ComputedFile() computed_file.filename = "logquant.tsv" computed_file.is_smashable = True computed_file.size_in_bytes = 123123 computed_file.result = result computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {"GSE51088": ["GSM1237818"]} ds.aggregate_by = "EXPERIMENT" ds.scale_by = "STANDARD" ds.email_address = "*****@*****.**" ds.quant_sf_only = True # Make the dataset include quant.sf files only ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = create_quantpendia(job.id) self.assertTrue( os.path.exists(final_context["output_dir"] + "/GSE51088/GSM1237818_quant.sf")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/README.md")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/LICENSE.TXT")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/aggregated_metadata.json")) self.assertTrue(final_context["metadata"]["quant_sf_only"]) self.assertEqual(final_context["metadata"]["num_samples"], 1) self.assertEqual(final_context["metadata"]["num_experiments"], 1) # test that archive exists quantpendia_file = ComputedFile.objects.filter( is_compendia=True, quant_sf_only=True).latest() self.assertTrue(os.path.exists(quantpendia_file.absolute_file_path))
def test_qn_reference(self):
    # We don't have a 0.tsv
    experiment = prepare_experiment(range(1, 201))

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    dataset = Dataset()
    dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False  # We don't QN because we're creating the target now
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    final_context = qn_reference.create_qn_reference(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["target_file"]))
    self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

    homo_sapiens = Organism.objects.get(taxonomy_id=9606)
    target = homo_sapiens.qn_target.computedfile_set.latest()
    self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

    # Create and run a smasher job that will use the QN target we just made.
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    ds = Dataset()
    ds.data = {"12345": ["1", "2", "3", "4", "5"]}
    ds.aggregate_by = "SPECIES"
    ds.scale_by = "STANDARD"
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context["success"])

    np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
    np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)

    # Make sure that the results were created. We create 200 computed files
    # and computational results (1 for each sample) plus the one generated
    # by the QN reference processor.
    self.assertEqual(ComputedFile.objects.all().count(), 200 + 1)
    self.assertEqual(ComputationalResult.objects.all().count(), 200 + 1)
    self.assertEqual(ComputationalResultAnnotation.objects.all().count(), 1)
def handle(self, *args, **options):
    """Create a QN reference target for the specified organism(s), or run the
    processor for a specific job_id.
    """
    if not options["job_id"]:
        if options["organism"] is None and not options["all"]:
            logger.error("You must specify an organism or --all")
            sys.exit(1)

        if options["organism"] and (options.get("organism", "") != "ALL"):
            organisms = [Organism.get_object_for_name(options["organism"].upper())]
        else:
            organisms = Organism.objects.all()

        for organism in organisms:
            if not organism_can_have_qn_target(organism):
                logger.error(
                    "Organism does not have any platform with enough samples to generate a qn target",
                    organism=organism,
                    min=options["min"],
                )
                continue

            samples = organism.sample_set.filter(
                has_raw=True, technology="MICROARRAY", is_processed=True
            )
            if samples.count() == 0:
                logger.error(
                    "No processed samples for organism.",
                    organism=organism,
                    count=samples.count(),
                )
                continue

            if options["platform"] is None:
                platform_counts = (
                    samples.values("platform_accession_code")
                    .annotate(dcount=Count("platform_accession_code"))
                    .order_by("-dcount")
                )
                biggest_platform = platform_counts[0]["platform_accession_code"]
            else:
                biggest_platform = options["platform"]

            sample_codes_results = Sample.processed_objects.filter(
                platform_accession_code=biggest_platform,
                has_raw=True,
                technology="MICROARRAY",
                organism=organism,
                is_processed=True,
            ).values("accession_code")
            sample_codes = [res["accession_code"] for res in sample_codes_results]

            dataset = Dataset()
            dataset.data = {organism.name + "_(" + biggest_platform + ")": sample_codes}
            dataset.aggregate_by = "ALL"
            dataset.scale_by = "NONE"
            dataset.quantile_normalize = False
            dataset.save()

            job = ProcessorJob()
            job.pipeline_applied = "QN_REFERENCE"
            job.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dataset
            pjda.save()

            final_context = qn_reference.create_qn_reference(job.pk)

            if final_context["success"]:
                print(":D")
                self.stdout.write("Target file: " + final_context["target_file"])
                self.stdout.write(
                    "Target S3: " + str(final_context["computed_files"][0].get_s3_url())
                )
            else:
                print(":(")
    else:
        qn_reference.create_qn_reference(options["job_id"])
def test_qn_reference(self): job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") experiment = Experiment() experiment.accession_code = "12345" experiment.save() for code in ['1', '2', '3', '4', '5', '6']: sample = Sample() sample.accession_code = code sample.title = code sample.platform_accession_code = 'A-MEXP-1171' sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS" sample.organism = homo_sapiens sample.technology = "MICROARRAY" sample.is_processed = True sample.save() cr = ComputationalResult() cr.save() file = ComputedFile() file.filename = code + ".tsv" file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv" file.size_in_bytes = int(code) file.result = cr file.is_smashable = True file.save() scfa = SampleComputedFileAssociation() scfa.sample = sample scfa.computed_file = file scfa.save() exsa = ExperimentSampleAssociation() exsa.experiment = experiment exsa.sample = sample exsa.save() dataset = Dataset() dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]} dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False # We don't QN because we're creating the target now dataset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) self.assertTrue(final_context['success']) self.assertTrue(os.path.exists(final_context['target_file'])) self.assertEqual(os.path.getsize(final_context['target_file']), 556) target = utils.get_most_recent_qn_target_for_organism(homo_sapiens) self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea') ### # Smasher with QN ### pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() ds = Dataset() ds.data = {"12345": ["1", "2", "3", "4", "5"]} ds.aggregate_by = 'SPECIES' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(final_context['success']) self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934) self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982) ## # Test via management command ## from django.core.management import call_command from django.test import TestCase from django.utils.six import StringIO out = StringIO() try: call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out) except SystemExit as e: # this is okay! pass stdout = out.getvalue() self.assertTrue('Target file' in stdout) path = stdout.split('\n')[0].split(':')[1].strip() self.assertTrue(os.path.exists(path)) self.assertEqual(path, utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
def handle(self, *args, **options):
    """Create a QN reference target for the specified organism(s)."""
    if options["organism"] is None and not options["all"]:
        logger.error("You must specify an organism or --all")
        sys.exit(1)

    if options["organism"] and (options.get('organism', '') != "ALL"):
        organisms = [Organism.get_object_for_name(options["organism"].upper())]
    else:
        organisms = Organism.objects.all()

    for organism in organisms:
        samples = Sample.processed_objects.filter(
            organism=organism,
            has_raw=True,
            technology="MICROARRAY",
            is_processed=True)
        if samples.count() == 0:
            logger.error(
                "No processed samples for organism.",
                organism=organism,
                count=samples.count())
            continue
        if samples.count() < options['min']:
            logger.error(
                "Processed samples don't meet minimum threshold",
                organism=organism,
                count=samples.count(),
                min=options["min"])
            continue

        if options["platform"] is None:
            platform_counts = samples.values(
                'platform_accession_code').annotate(
                    dcount=Count('platform_accession_code')).order_by('-dcount')
            biggest_platform = platform_counts[0]['platform_accession_code']
        else:
            biggest_platform = options["platform"]

        sample_codes_results = Sample.processed_objects.filter(
            platform_accession_code=biggest_platform,
            has_raw=True,
            technology="MICROARRAY",
            is_processed=True).values('accession_code')
        sample_codes = [res['accession_code'] for res in sample_codes_results]

        dataset = Dataset()
        dataset.data = {
            organism.name + '_(' + biggest_platform + ')': sample_codes
        }
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False
        dataset.save()

        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)

        if final_context['success']:
            print(":D")
            self.stdout.write("Target file: " + final_context['target_file'])
            self.stdout.write(
                "Target S3: " + str(final_context['computed_files'][0].get_s3_url()))
        else:
            print(":(")
def test_no_smash_dupe(self): """ """ job = ProcessorJob() job.pipeline_applied = "SMASHER" job.save() experiment = Experiment() experiment.accession_code = "GSE51081" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1237810' sample.title = 'GSM1237810' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1237810_T09-1084.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1237811' sample.title = 'GSM1237811' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() result = ComputationalResult() result.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']} ds.aggregate_by = 'ALL' ds.scale_by = 'STANDARD' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = smasher.smash(job.pk, upload=False) dsid = ds.id ds = Dataset.objects.get(id=dsid) self.assertTrue(ds.success) for column in final_context['original_merged'].columns: self.assertTrue('_x' not in column)
def test_create_compendia(self): job = ProcessorJob() job.pipeline_applied = "COMPENDIA" job.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1487313" experiment.save() result = ComputationalResult() result.save() gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS") sample = Sample() sample.accession_code = 'GSM1487313' sample.title = 'GSM1487313' sample.organism = gallus_gallus sample.technology = "MICROARRAY" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1487313_liver.PCL" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() # RNASEQ TECH experiment2 = Experiment() experiment2.accession_code = "SRS332914" experiment2.save() result2 = ComputationalResult() result2.save() sample2 = Sample() sample2.accession_code = 'SRS332914' sample2.title = 'SRS332914' sample2.organism = gallus_gallus sample2.technology = "RNA-SEQ" sample2.save() sra2 = SampleResultAssociation() sra2.sample = sample2 sra2.result = result2 sra2.save() esa2 = ExperimentSampleAssociation() esa2.experiment = experiment2 esa2.sample = sample2 esa2.save() computed_file2 = ComputedFile() computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv" computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename computed_file2.result = result2 computed_file2.size_in_bytes = 234 computed_file2.is_smashable = True computed_file2.save() assoc2 = SampleComputedFileAssociation() assoc2.sample = sample2 assoc2.computed_file = computed_file2 assoc2.save() dset = Dataset() dset.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']} dset.scale_by = 'NONE' dset.aggregate_by = 'SPECIES' dset.quantile_normalize = False dset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dset pjda.save() final_context = create_compendia.create_compendia(job.id)
def test_log2(self): pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() # Has non-log2 data: # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421 # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz experiment = Experiment() experiment.accession_code = "GSE44421" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") sample = Sample() sample.accession_code = 'GSM1084806' sample.title = 'GSM1084806' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1084806-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() sample = Sample() sample.accession_code = 'GSM1084807' sample.title = 'GSM1084807' sample.organism = homo_sapiens sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.filename = "GSM1084807-tbl-1.txt" computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename computed_file.result = result computed_file.size_in_bytes = 123 computed_file.is_smashable = True computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']} ds.aggregate_by = 'EXPERIMENT' ds.scale_by = 'MINMAX' ds.email_address = "*****@*****.**" ds.quantile_normalize = False ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) ds = Dataset.objects.get(id=ds.id) self.assertTrue(final_context['success'])
def test_qn_reference(self): job = ProcessorJob() job.pipeline_applied = "QN_REFERENCE" job.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606) homo_sapiens.save() experiment = Experiment() experiment.accession_code = "12345" experiment.save() # We don't have a 0.tsv codes = [str(i) for i in range(1, 201)] for code in codes: sample = Sample() sample.accession_code = code sample.title = code sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS" sample.organism = homo_sapiens sample.technology = "MICROARRAY" sample.is_processed = True sample.save() cr = ComputationalResult() cr.save() computed_file = ComputedFile() computed_file.filename = code + ".tsv" computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv" computed_file.size_in_bytes = int(code) computed_file.result = cr computed_file.is_smashable = True computed_file.save() scfa = SampleComputedFileAssociation() scfa.sample = sample scfa.computed_file = computed_file scfa.save() exsa = ExperimentSampleAssociation() exsa.experiment = experiment exsa.sample = sample exsa.save() dataset = Dataset() dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]} dataset.aggregate_by = "ALL" dataset.scale_by = "NONE" dataset.quantile_normalize = False # We don't QN because we're creating the target now dataset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dataset pjda.save() final_context = qn_reference.create_qn_reference(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["target_file"])) self.assertEqual(os.path.getsize(final_context["target_file"]), 562) homo_sapiens.refresh_from_db() target = homo_sapiens.qn_target.computedfile_set.latest() self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a") # Create and run a smasher job that will use the QN target we just made. pj = ProcessorJob() pj.pipeline_applied = "SMASHER" pj.save() ds = Dataset() ds.data = {"12345": ["1", "2", "3", "4", "5"]} ds.aggregate_by = "SPECIES" ds.scale_by = "STANDARD" ds.email_address = "*****@*****.**" ds.quantile_normalize = True ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = pj pjda.dataset = ds pjda.save() final_context = smasher.smash(pj.pk, upload=False) self.assertTrue(final_context["success"]) np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811) np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
def test_create_compendia_microarray_only(self): """ Make sure that we can actually create a compendium with just microarray samples. """ job = ProcessorJob() job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value job.save() # MICROARRAY TECH experiment = Experiment() experiment.accession_code = "GSE1234" experiment.save() result = ComputationalResult() result.save() qn_target = ComputedFile() qn_target.filename = "danio_target.tsv" qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv" qn_target.is_qn_target = True qn_target.size_in_bytes = "12345" qn_target.sha1 = "aabbccddeeff" qn_target.result = result qn_target.save() danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result) danio_rerio.save() cra = ComputationalResultAnnotation() cra.data = {} cra.data["organism_id"] = danio_rerio.id cra.data["is_qn"] = True cra.result = result cra.save() result = ComputationalResult() result.save() micros = [] for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"): if "microarray.txt" in file: continue create_sample_for_experiment( { "organism": danio_rerio, "accession_code": file, "technology": "MICROARRAY", "filename": file, "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/", }, experiment, ) micros.append(file) dset = Dataset() dset.data = {"GSE1234": micros} dset.scale_by = "NONE" dset.aggregate_by = "SPECIES" dset.svd_algorithm = "ARPACK" dset.quantile_normalize = True dset.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = dset pjda.save() final_context = create_compendia.create_compendia(job.id) self.assertSucceeded(job) # Verify result self.assertEqual( final_context["compendium_result"].result.computedfile_set.count(), 1) for file in final_context[ "compendium_result"].result.computedfile_set.all(): self.assertTrue(os.path.exists(file.absolute_file_path)) # test compendium_result self.assertEqual(final_context["compendium_result"].svd_algorithm, "ARPACK") self.assertEqual( final_context["compendium_result"].primary_organism.name, final_context["organism_name"], ) self.assertEqual( final_context["compendium_result"].primary_organism.name, "DANIO_RERIO") self.assertEqual(final_context["compendium_result"].organisms.count(), 1) zf = zipfile.ZipFile(final_context["compendium_result"].result. computedfile_set.first().absolute_file_path) with zf.open("aggregated_metadata.json") as f: metadata = json.load(f) self.assertFalse(metadata.get("quant_sf_only")) # 420 microarray self.assertEqual(metadata.get("num_samples"), 420) self.assertEqual(metadata.get("num_experiments"), 1) # Make sure the data were quantile normalized self.assertTrue(metadata.get("quantile_normalized")) self.assertIn("ks_statistic", final_context) self.assertIn("ks_pvalue", final_context) self.assertEqual(final_context["ks_pvalue"], 1.0)
def test_imputation(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "MICROARRAY",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
            },
            experiment,
        )

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "RNA-SEQ",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
            },
            experiment,
        )

        rnas.append(file)

    # Missing sample that will be filtered
    sample = create_sample_for_experiment(
        {
            "organism": danio_rerio,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "RNA-SEQ",
            "filename": None,
        },
        experiment,
    )
    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
        create_compendia._perform_imputation
    )

    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {"job_id": job.id, "pipeline": pipeline},
        create_compendia.COMPENDIA_PIPELINE[:imputation_index],
    )

    # First, run the imputation step without removing anything to get a baseline
    expected_context = utils.run_pipeline(
        job_context.copy(), [create_compendia.COMPENDIA_PIPELINE[imputation_index]]
    )

    # Now pick some rows to remove according to the instructions from
    # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336
    random.seed(42)

    # Select some rows randomly and mask a little bit less than 30% of the values
    rare_rows = random.sample(list(job_context["microarray_matrix"].index), k=25)
    rare_genes = {}
    for row in rare_rows:
        cols = random.sample(
            list(job_context["microarray_matrix"].columns),
            # There are around 840 samples, and we want to pick a little bit
            # less than 30% of them
            k=int(0.28 * 840),
        )
        rare_genes[row] = cols
        for col in cols:
            job_context["microarray_matrix"].loc[row, col] = np.nan

    # Now randomly select some entries from the other rows to mask
    individual_indices = random.sample(
        list(
            itertools.product(
                set(job_context["microarray_matrix"].index) - set(rare_rows),
                job_context["microarray_matrix"].columns,
            )
        ),
        k=1000,
    )
    for row, col in individual_indices:
        job_context["microarray_matrix"].loc[row, col] = np.nan

    final_context = utils.run_pipeline(
        job_context, [create_compendia.COMPENDIA_PIPELINE[imputation_index]]
    )
    self.assertDidNotFail(job)

    index = set(final_context["merged_no_qn"].index) & set(
        expected_context["merged_no_qn"].index
    )
    columns = set(final_context["merged_no_qn"].columns) & set(
        expected_context["merged_no_qn"].columns
    )

    # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
    # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
    # for a description of the formula.
    N = 0
    squared_error = 0
    affected_entries = {
        *individual_indices,
        *((row, col) for row, cols in rare_genes.items() for col in cols),
    }
    for row, col in affected_entries:
        if row in index and col in columns:
            actual = final_context["merged_no_qn"].loc[row, col]
            expected = expected_context["merged_no_qn"].loc[row, col]

            N += 1
            squared_error += (actual - expected) ** 2

    rmse = math.sqrt(squared_error / N)

    # The results of a previous run plus a little bit of leeway
    self.assertLess(abs(rmse - 0.2868600293662542), 0.05)

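The imputation test calls a create_sample_for_experiment helper that is not part of this excerpt. Based on the fields the test passes in and on the equivalent inline setup in test_create_compendia_danio further below, a minimal sketch of what such a helper might look like follows; the project's actual helper may have a different signature or behavior.

def create_sample_for_experiment(sample_info: dict, experiment: Experiment) -> Sample:
    # Sketch only: mirrors the inline Sample/ComputedFile setup used in the
    # compendia test below; not taken from the project's source.
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = sample_info["accession_code"]
    sample.title = sample_info.get("title") or sample_info["accession_code"]
    sample.organism = sample_info["organism"]
    sample.technology = sample_info["technology"]
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    # A sample created with filename=None gets no computed file, which is what
    # lets the pipeline filter it out later in the test.
    if sample_info.get("filename") is not None:
        computed_file = ComputedFile()
        computed_file.filename = sample_info["filename"]
        computed_file.absolute_file_path = sample_info["data_dir"] + sample_info["filename"]
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

    return sample
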
def handle(self, *args, **options):
    """Dispatch QN_REFERENCE creation jobs for all Organisms with a platform
    with enough processed samples.
    """
    organisms = Organism.objects.all()

    for organism in organisms:
        samples = Sample.processed_objects.filter(
            organism=organism,
            has_raw=True,
            technology="MICROARRAY",
            is_processed=True,
            platform_name__contains="Affymetrix",
        )
        if samples.count() < MIN:
            logger.info(
                "Total processed samples don't meet minimum threshold",
                organism=organism,
                count=samples.count(),
                min=MIN,
            )
            continue

        # Find the platform with the most processed samples for this organism.
        platform_counts = (
            samples.values("platform_accession_code")
            .annotate(dcount=Count("platform_accession_code"))
            .order_by("-dcount")
        )
        biggest_platform = platform_counts[0]["platform_accession_code"]

        sample_codes_results = Sample.processed_objects.filter(
            platform_accession_code=biggest_platform,
            has_raw=True,
            technology="MICROARRAY",
            organism=organism,
            is_processed=True,
        ).values("accession_code")

        if sample_codes_results.count() < MIN:
            logger.info(
                "Number of processed samples for largest platform didn't meet threshold.",
                organism=organism,
                platform_accession_code=biggest_platform,
                count=sample_codes_results.count(),
                min=MIN,
            )
            continue

        sample_codes = [res["accession_code"] for res in sample_codes_results]

        dataset = Dataset()
        dataset.data = {organism.name + "_(" + biggest_platform + ")": sample_codes}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False
        dataset.save()

        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        logger.info(
            "Sending QN_REFERENCE for Organism",
            job_id=str(job.pk),
            organism=str(organism),
        )
        send_job(ProcessorPipeline.QN_REFERENCE, job)

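This handle method is the body of a Django management command and relies on module-level names (MIN, logger, Count, send_job, the model classes) that the snippet does not show. A sketch of the surrounding setup it appears to assume; the import paths and the value of MIN are assumptions based on the names used above, not taken from the project:

from django.core.management.base import BaseCommand
from django.db.models import Count

# Assumed import paths; the real module layout may differ.
from data_refinery_common.job_lookup import ProcessorPipeline
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.message_queue import send_job
from data_refinery_common.models import (
    Dataset,
    Organism,
    ProcessorJob,
    ProcessorJobDatasetAssociation,
    Sample,
)

logger = get_and_configure_logger(__name__)

# Minimum number of processed samples a platform needs before a QN reference
# is worth building; the exact value here is an assumption.
MIN = 100


class Command(BaseCommand):
    def handle(self, *args, **options):
        ...  # body as shown above
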
def test_create_compendia_danio(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        rnas.append(file)

    # Missing sample that will be filtered
    sample = Sample()
    sample.accession_code = "GSM1487222"
    sample.title = "this sample will be filtered"
    sample.organism = danio_rerio
    sample.technology = "RNASEQ"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Verify result
    self.assertEqual(
        final_context["compendium_result"].result.computedfile_set.count(), 1
    )
    for file in final_context["compendium_result"].result.computedfile_set.all():
        self.assertTrue(os.path.exists(file.absolute_file_path))

    # test compendium_result
    self.assertEqual(final_context["compendium_result"].svd_algorithm, "ARPACK")
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        final_context["organism_name"],
    )
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name, "DANIO_RERIO"
    )
    self.assertEqual(final_context["compendium_result"].organisms.count(), 1)

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE5678",
    )
