def handle(self, *args, **options):
    if options["job_id"] is None:
        logger.error("You must specify a job ID.", job_id=options["job_id"])
        sys.exit(1)

    try:
        job_type = ProcessorPipeline[options["job_name"]]
    except KeyError:
        logger.error(
            "You must specify a valid job name.",
            job_name=options["job_name"],
            job_id=options["job_id"],
        )
        sys.exit(1)

    # Each processor is imported lazily inside its branch so a worker only
    # loads the dependencies of the job it is actually running.
    if job_type is ProcessorPipeline.AFFY_TO_PCL:
        from data_refinery_workers.processors.array_express import affy_to_pcl
        affy_to_pcl(options["job_id"])
    elif job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT:
        from data_refinery_workers.processors.transcriptome_index import build_transcriptome_index
        build_transcriptome_index(options["job_id"], length="short")
    elif job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG:
        from data_refinery_workers.processors.transcriptome_index import build_transcriptome_index
        build_transcriptome_index(options["job_id"], length="long")
    elif job_type is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
        from data_refinery_workers.processors.agilent_twocolor import agilent_twocolor_to_pcl
        agilent_twocolor_to_pcl(options["job_id"])
    elif job_type is ProcessorPipeline.ILLUMINA_TO_PCL:
        from data_refinery_workers.processors.illumina import illumina_to_pcl
        illumina_to_pcl(options["job_id"])
    elif job_type is ProcessorPipeline.SALMON:
        from data_refinery_workers.processors.salmon import salmon
        salmon(options["job_id"])
    elif job_type is ProcessorPipeline.SMASHER:
        from data_refinery_workers.processors.smasher import smash
        smash(options["job_id"])
    elif job_type is ProcessorPipeline.NO_OP:
        from data_refinery_workers.processors.no_op import no_op_processor
        no_op_processor(options["job_id"])
    elif job_type is ProcessorPipeline.JANITOR:
        from data_refinery_workers.processors.janitor import run_janitor
        run_janitor(options["job_id"])
    elif job_type is ProcessorPipeline.QN_REFERENCE:
        from data_refinery_workers.processors import qn_reference
        qn_reference.create_qn_reference(options["job_id"])
    else:
        logger.error(
            ("A valid job name was specified for job %s with id %d but "
             "no processor function is known to run it."),
            options["job_name"],
            options["job_id"],
        )
        sys.exit(1)

    sys.exit(0)
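# A dispatch-table sketch (illustration only, not the project's actual code)
# of the if/elif chain in handle() above. It assumes the same module paths
# and processor functions as the original imports, but resolves them eagerly,
# trading the original's lazy per-branch imports for brevity.
def run_job(job_type, job_id):
    from data_refinery_workers.processors import (
        agilent_twocolor, array_express, illumina, janitor, no_op,
        qn_reference, salmon, smasher, transcriptome_index,
    )

    dispatch = {
        ProcessorPipeline.AFFY_TO_PCL: array_express.affy_to_pcl,
        ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT: (
            lambda jid: transcriptome_index.build_transcriptome_index(jid, length="short")
        ),
        ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG: (
            lambda jid: transcriptome_index.build_transcriptome_index(jid, length="long")
        ),
        ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL: agilent_twocolor.agilent_twocolor_to_pcl,
        ProcessorPipeline.ILLUMINA_TO_PCL: illumina.illumina_to_pcl,
        ProcessorPipeline.SALMON: salmon.salmon,
        ProcessorPipeline.SMASHER: smasher.smash,
        ProcessorPipeline.NO_OP: no_op.no_op_processor,
        ProcessorPipeline.JANITOR: janitor.run_janitor,
        ProcessorPipeline.QN_REFERENCE: qn_reference.create_qn_reference,
    }

    try:
        processor = dispatch[job_type]
    except KeyError:
        # Mirrors the else branch above: a valid enum member with no processor.
        raise ValueError("No processor function is known to run job type %s." % job_type)
    return processor(job_id)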
def create_qn_target(organism, platform, create_results=True):
    # Collect the accession codes of every processed microarray sample for
    # this organism on the given platform.
    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")
    sample_codes = [res["accession_code"] for res in sample_codes_results]

    # Build a dataset over those samples. It is not quantile normalized
    # because this job is what creates the QN target in the first place.
    dataset = Dataset()
    dataset.data = {organism.name + "_(" + platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    return qn_reference.create_qn_reference(job.pk, create_results=create_results)
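# A hypothetical call site for create_qn_target(). The platform accession is
# illustrative (borrowed from the tests below), and it assumes an Organism
# row for HOMO_SAPIENS already exists in the database.
homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
final_context = create_qn_target(homo_sapiens, platform="A-MEXP-1171")
if final_context["success"]:
    print("QN target written to " + final_context["target_file"])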
def handle(self, *args, **options):
    """Create a QN reference target for one organism or for --all organisms,
    or, when a job ID is given, run that existing QN_REFERENCE processor job."""
    if not options["job_id"]:
        if options["organism"] is None and not options["all"]:
            logger.error("You must specify an organism or --all")
            sys.exit(1)

        if options["organism"] and (options.get("organism", "") != "ALL"):
            organisms = [Organism.get_object_for_name(options["organism"].upper())]
        else:
            organisms = Organism.objects.all()

        for organism in organisms:
            if not organism_can_have_qn_target(organism, options["min"]):
                logger.error(
                    "Organism does not have any platform with enough samples "
                    "to generate a qn target",
                    organism=organism,
                    min=options["min"],
                )
                continue

            if options["platform"] is None:
                biggest_platform = get_biggest_platform(organism)
                if biggest_platform is None:
                    logger.error("No processed samples for organism.", organism=organism)
                    continue
            else:
                biggest_platform = options["platform"]

            final_context = create_qn_target(organism, platform=biggest_platform)

            if final_context["success"]:
                print(":D")
                self.stdout.write("Target file: " + final_context["target_file"])
                self.stdout.write(
                    "Target S3: " + str(final_context["computed_files"][0].get_s3_url())
                )
            else:
                print(":(")
    else:
        qn_reference.create_qn_reference(options["job_id"])
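# Hedged sketches of the two helpers handle() above relies on. They are
# reconstructed from the inline queries in the older variants of this command
# further down in this file; the threshold default is an assumption, not
# necessarily the project's exact implementation.
from django.db.models import Count


def organism_can_have_qn_target(organism, sample_threshold=100):
    """True if some microarray platform for this organism has at least
    sample_threshold processed samples (the default of 100 is assumed)."""
    top_platform = (
        organism.sample_set.filter(has_raw=True, technology="MICROARRAY", is_processed=True)
        .values("platform_accession_code")
        .annotate(dcount=Count("platform_accession_code"))
        .order_by("-dcount")
        .first()
    )
    return top_platform is not None and top_platform["dcount"] >= sample_threshold


def get_biggest_platform(organism):
    """Return the platform accession code with the most processed samples for
    this organism, or None if it has no processed microarray samples."""
    top_platform = (
        organism.sample_set.filter(has_raw=True, technology="MICROARRAY", is_processed=True)
        .values("platform_accession_code")
        .annotate(dcount=Count("platform_accession_code"))
        .order_by("-dcount")
        .first()
    )
    if top_platform is None:
        return None
    return top_platform["platform_accession_code"]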
def test_qn_reference(self):
    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    homo_sapiens.save()

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    # We don't have a 0.tsv
    codes = [str(i) for i in range(1, 201)]

    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    dataset = Dataset()
    dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False  # We don't QN because we're creating the target now
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    final_context = qn_reference.create_qn_reference(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["target_file"]))
    self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

    homo_sapiens.refresh_from_db()
    target = homo_sapiens.qn_target.computedfile_set.latest()
    self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

    # Create and run a smasher job that will use the QN target we just made.
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    ds = Dataset()
    ds.data = {"12345": ["1", "2", "3", "4", "5"]}
    ds.aggregate_by = "SPECIES"
    ds.scale_by = "STANDARD"
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context["success"])
    np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
    np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
def test_qn_reference(self):
    # We don't have a 0.tsv
    experiment = prepare_experiment(range(1, 201))

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    dataset = Dataset()
    dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False  # We don't QN because we're creating the target now
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    final_context = qn_reference.create_qn_reference(job.pk)
    self.assertTrue(final_context["success"])
    self.assertTrue(os.path.exists(final_context["target_file"]))
    self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

    homo_sapiens = Organism.objects.get(taxonomy_id=9606)
    target = homo_sapiens.qn_target.computedfile_set.latest()
    self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

    # Create and run a smasher job that will use the QN target we just made.
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    ds = Dataset()
    ds.data = {"12345": ["1", "2", "3", "4", "5"]}
    ds.aggregate_by = "SPECIES"
    ds.scale_by = "STANDARD"
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context["success"])
    np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
    np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)

    # Make sure that the results were created. We create 200 computed files
    # and computational results (1 for each sample) plus the one generated
    # by the QN reference processor.
    self.assertEqual(ComputedFile.objects.all().count(), 200 + 1)
    self.assertEqual(ComputationalResult.objects.all().count(), 200 + 1)
    self.assertEqual(ComputationalResultAnnotation.objects.all().count(), 1)
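# Hedged sketch of the prepare_experiment() helper the refactored test above
# assumes. It is reconstructed from the inline setup in the earlier variant
# of this test and may differ from the project's actual test helper.
def prepare_experiment(ids):
    (homo_sapiens, _) = Organism.objects.get_or_create(
        name="HOMO_SAPIENS", taxonomy_id=9606
    )

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    for code in [str(i) for i in ids]:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    return experiment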
def handle(self, *args, **options):
    """Create a QN reference target for one organism or for --all organisms,
    or, when a job ID is given, run that existing QN_REFERENCE processor job."""
    if not options["job_id"]:
        if options["organism"] is None and not options["all"]:
            logger.error("You must specify an organism or --all")
            sys.exit(1)

        if options["organism"] and (options.get("organism", "") != "ALL"):
            organisms = [Organism.get_object_for_name(options["organism"].upper())]
        else:
            organisms = Organism.objects.all()

        for organism in organisms:
            if not organism_can_have_qn_target(organism):
                logger.error(
                    "Organism does not have any platform with enough samples "
                    "to generate a qn target",
                    organism=organism,
                    min=options["min"],
                )
                continue

            samples = organism.sample_set.filter(
                has_raw=True, technology="MICROARRAY", is_processed=True
            )
            if samples.count() == 0:
                logger.error(
                    "No processed samples for organism.",
                    organism=organism,
                    count=samples.count(),
                )
                continue

            if options["platform"] is None:
                platform_counts = (
                    samples.values("platform_accession_code")
                    .annotate(dcount=Count("platform_accession_code"))
                    .order_by("-dcount")
                )
                biggest_platform = platform_counts[0]["platform_accession_code"]
            else:
                biggest_platform = options["platform"]

            sample_codes_results = Sample.processed_objects.filter(
                platform_accession_code=biggest_platform,
                has_raw=True,
                technology="MICROARRAY",
                organism=organism,
                is_processed=True,
            ).values("accession_code")
            sample_codes = [res["accession_code"] for res in sample_codes_results]

            dataset = Dataset()
            dataset.data = {organism.name + "_(" + biggest_platform + ")": sample_codes}
            dataset.aggregate_by = "ALL"
            dataset.scale_by = "NONE"
            dataset.quantile_normalize = False
            dataset.save()

            job = ProcessorJob()
            job.pipeline_applied = "QN_REFERENCE"
            job.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dataset
            pjda.save()

            final_context = qn_reference.create_qn_reference(job.pk)

            if final_context["success"]:
                print(":D")
                self.stdout.write("Target file: " + final_context["target_file"])
                self.stdout.write(
                    "Target S3: " + str(final_context["computed_files"][0].get_s3_url())
                )
            else:
                print(":(")
    else:
        qn_reference.create_qn_reference(options["job_id"])
def test_qn_reference(self):
    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    for code in ['1', '2', '3', '4', '5', '6']:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = 'A-MEXP-1171'
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        file = ComputedFile()
        file.filename = code + ".tsv"
        file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        file.size_in_bytes = int(code)
        file.result = cr
        file.is_smashable = True
        file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    dataset = Dataset()
    dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False  # We don't QN because we're creating the target now
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    final_context = qn_reference.create_qn_reference(job.pk)
    self.assertTrue(final_context['success'])
    self.assertTrue(os.path.exists(final_context['target_file']))
    self.assertEqual(os.path.getsize(final_context['target_file']), 556)

    target = utils.get_most_recent_qn_target_for_organism(homo_sapiens)
    self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea')

    ###
    # Smasher with QN
    ###
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    ds = Dataset()
    ds.data = {"12345": ["1", "2", "3", "4", "5"]}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context['success'])
    self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934)
    self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982)

    ##
    # Test via management command
    ##
    from django.core.management import call_command
    from django.utils.six import StringIO

    out = StringIO()
    try:
        call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out)
    except SystemExit:
        pass  # This is okay! The command exits explicitly.

    stdout = out.getvalue()
    self.assertTrue('Target file' in stdout)

    path = stdout.split('\n')[0].split(':')[1].strip()
    self.assertTrue(os.path.exists(path))
    self.assertEqual(
        path,
        utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
def handle(self, *args, **options):
    """Create a QN reference target for one organism or for --all organisms."""
    if options["organism"] is None and not options["all"]:
        logger.error("You must specify an organism or --all")
        sys.exit(1)

    if options["organism"] and (options.get('organism', '') != "ALL"):
        organisms = [Organism.get_object_for_name(options["organism"].upper())]
    else:
        organisms = Organism.objects.all()

    for organism in organisms:
        samples = Sample.processed_objects.filter(
            organism=organism,
            has_raw=True,
            technology="MICROARRAY",
            is_processed=True)
        if samples.count() == 0:
            logger.error("No processed samples for organism.",
                         organism=organism,
                         count=samples.count())
            continue

        if samples.count() < options['min']:
            logger.error("Processed samples don't meet minimum threshold",
                         organism=organism,
                         count=samples.count(),
                         min=options["min"])
            continue

        if options["platform"] is None:
            platform_counts = (
                samples.values('platform_accession_code')
                .annotate(dcount=Count('platform_accession_code'))
                .order_by('-dcount'))
            biggest_platform = platform_counts[0]['platform_accession_code']
        else:
            biggest_platform = options["platform"]

        # Scope the selection to this organism as well as the platform, so a
        # platform shared across organisms can't pull in foreign samples.
        sample_codes_results = Sample.processed_objects.filter(
            platform_accession_code=biggest_platform,
            organism=organism,
            has_raw=True,
            technology="MICROARRAY",
            is_processed=True).values('accession_code')
        sample_codes = [res['accession_code'] for res in sample_codes_results]

        dataset = Dataset()
        dataset.data = {organism.name + '_(' + biggest_platform + ')': sample_codes}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False
        dataset.save()

        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        if final_context['success']:
            print(":D")
            self.stdout.write("Target file: " + final_context['target_file'])
            self.stdout.write(
                "Target S3: " + str(final_context['computed_files'][0].get_s3_url()))
        else:
            print(":(")