def handle(self, *args, **options):
    """Run the processor pipeline named by ``--job-name`` for ``--job-id``.

    Looks up ``options["job_name"]`` in the ProcessorPipeline enum and
    dispatches to the matching processor entry point. Exits with status 1
    when the job ID is missing, the job name is invalid, or no processor
    function is known for the pipeline; exits with status 0 on completion.
    """
    if options["job_id"] is None:
        logger.error("You must specify a job ID.", job_id=options["job_id"])
        sys.exit(1)

    try:
        job_type = ProcessorPipeline[options["job_name"]]
    except KeyError:
        logger.error(
            "You must specify a valid job name.",
            job_name=options["job_name"],
            job_id=options["job_id"],
        )
        sys.exit(1)

    job_id = options["job_id"]

    # Each branch imports its processor lazily so that only the selected
    # pipeline's (potentially heavy) dependencies are loaded.
    if job_type is ProcessorPipeline.AFFY_TO_PCL:
        from data_refinery_workers.processors.array_express import affy_to_pcl

        affy_to_pcl(job_id)
    elif job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT:
        from data_refinery_workers.processors.transcriptome_index import (
            build_transcriptome_index,
        )

        build_transcriptome_index(job_id, length="short")
    elif job_type is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG:
        from data_refinery_workers.processors.transcriptome_index import (
            build_transcriptome_index,
        )

        build_transcriptome_index(job_id, length="long")
    elif job_type is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
        from data_refinery_workers.processors.agilent_twocolor import (
            agilent_twocolor_to_pcl,
        )

        agilent_twocolor_to_pcl(job_id)
    elif job_type is ProcessorPipeline.ILLUMINA_TO_PCL:
        from data_refinery_workers.processors.illumina import illumina_to_pcl

        illumina_to_pcl(job_id)
    elif job_type is ProcessorPipeline.SALMON:
        from data_refinery_workers.processors.salmon import salmon

        salmon(job_id)
    elif job_type is ProcessorPipeline.SMASHER:
        from data_refinery_workers.processors.smasher import smash

        smash(job_id)
    elif job_type is ProcessorPipeline.NO_OP:
        from data_refinery_workers.processors.no_op import no_op_processor

        no_op_processor(job_id)
    elif job_type is ProcessorPipeline.JANITOR:
        from data_refinery_workers.processors.janitor import run_janitor

        run_janitor(job_id)
    elif job_type is ProcessorPipeline.QN_REFERENCE:
        from data_refinery_workers.processors import qn_reference

        qn_reference.create_qn_reference(job_id)
    else:
        # Keyword-style logging for consistency with the other logger calls
        # in this function (the original passed positional %-format args,
        # which a kwargs-style logger would not interpolate).
        logger.error(
            "A valid job name was specified but no processor function is known to run it.",
            job_name=options["job_name"],
            job_id=options["job_id"],
        )
        sys.exit(1)

    sys.exit(0)
def test_illumina_no_pvalue(self):
    """This experiment should fail because it has no p-value columns, so
    make sure it fails at that stage of the processing"""
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41355/suppl/GSE41355%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename": "GSE41355_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE41355_non-normalized.txt",
            "organism": organism,
            "samples": [
                ("GSM1015436", "IRF3/7 DKO 2"),
            ],
        }
    )

    # The returned context is not needed (it was previously assigned to an
    # unused variable); the assertion inspects the job itself.
    illumina.illumina_to_pcl(job.pk, cleanup=False)
    self.assertFailed(job, "Could not detect PValue column!")
def test_illumina_id_ref_column_with_whitespace(self):
    """This test case tests the issue brought up in
    https://github.com/alexslemonade/refinebio/issues/1560
    where an ID_REF column would not be detected because the column name
    had a trailing space
    """
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100301/suppl/GSE100301%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename": "GSE100301_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE100301_non-normalized.txt",
            "organism": organism,
            "samples": [
                (
                    "GSM2677583",
                    "22Rv1-tetO-Gal4, replicate 1",
                    {"description": ["SAMPLE 1"]},
                ),
            ],
        }
    )

    # The returned context is not needed (it was previously assigned to an
    # unused variable); the assertion inspects the job itself.
    illumina.illumina_to_pcl(job.pk, cleanup=False)
    self.assertSucceeded(job)
def test_illumina_rows_starting_with_whitespace(self):
    """The job should succeed on input whose data rows begin with
    whitespace (GSE112517)."""
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112517/suppl/GSE112517_non-normalized.txt.gz",
            "filename": "GSE112517_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE112517_non-normalized.txt",
            "organism": organism,
            "samples": [
                (
                    "GSM3071991",
                    "MCF-7 KLHDC7B siRNA knockdown control",
                    {"description": ["SAMPLE 1"]},
                ),
                (
                    "GSM3071992",
                    "MCF-7 KLHDC7B siRNA knockdown",
                    {"description": ["SAMPLE 2"]},
                ),
            ],
        }
    )

    # The returned context is not needed (it was previously assigned to an
    # unused variable); the assertion inspects the job itself.
    illumina.illumina_to_pcl(job.pk, cleanup=False)
    self.assertSucceeded(job)
def test_illumina_to_pcl(self):
    """Most basic Illumina to PCL test"""
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job({**GSE22427, "organism": organism})

    # Blank out one sample's title so we exercise the fallback that finds
    # its detection column via the description annotation instead.
    renamed_sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    renamed_sample.title = "ignoreme_for_description"
    renamed_sample.accession_code = "ignoreme_for_description"
    renamed_sample.save()

    context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
    self.assertSucceeded(processor_job)

    for sample in context["samples"]:
        result_file = sample.get_most_recent_smashable_result_file()
        self.assertTrue(os.path.exists(result_file.absolute_file_path))
        os.remove(result_file.absolute_file_path)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_bad_illumina_detection(self):
    """With the wrong species, this will fail the platform detection threshold."""
    from data_refinery_workers.processors import illumina

    processor_job = prepare_illumina_job('RATTUS_NORVEGICUS')
    context = illumina.illumina_to_pcl(processor_job.pk)
    self.assertTrue(context['abort'])

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_illumina_to_pcl(self):
    """Most basic Illumina to PCL test"""
    from data_refinery_workers.processors import illumina

    processor_job = prepare_illumina_job()
    context = illumina.illumina_to_pcl(processor_job.pk)

    # Every sample should have produced a smashable result file on disk.
    for sample in context["samples"]:
        result_file = sample.get_most_recent_smashable_result_file()
        self.assertTrue(os.path.exists(result_file.absolute_file_path))
        os.remove(result_file.absolute_file_path)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_good_detection(self):
    """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO.

    Shows our detector works.
    """
    from data_refinery_workers.processors import illumina

    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "ILLUMINA_TO_PCL"
    processor_job.save()

    original_file = OriginalFile()
    original_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
    original_file.filename = "GSE54661_non_normalized.txt"
    original_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
    )
    original_file.is_downloaded = True
    original_file.save()

    job_assoc = ProcessorJobOriginalFileAssociation()
    job_assoc.original_file = original_file
    job_assoc.processor_job = processor_job
    job_assoc.save()

    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    sample = Sample()
    sample.accession_code = "ABCD-1234"
    sample.title = "hypoxia_Signal"
    sample.organism = organism
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.original_file = original_file
    sample_assoc.sample = sample
    sample_assoc.save()

    context = illumina.illumina_to_pcl(processor_job.pk)
    self.assertEqual(context["platform"], "illuminaHumanv3")

    # The first sample's annotation data should contain only detection stats.
    allowed_keys = ["detected_platform", "detection_percentage", "mapped_percentage"]
    for key in context["samples"][0].sampleannotation_set.all()[0].data.keys():
        self.assertTrue(key in allowed_keys)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_bad_illumina_detection(self):
    """With the wrong species, this will fail the platform detection threshold."""
    # NOTE(review): taxonomy_id 9606 is Homo sapiens (rat is 10116); the
    # mismatched name appears to be what drives the failed detection, but
    # confirm the ID is intentional.
    organism = Organism(name="RATTUS_NORVEGICUS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job({**GSE22427, "organism": organism})
    context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
    self.assertTrue(context["abort"])

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_illumina_to_pcl(self):
    """Most basic Illumina to PCL test"""
    from data_refinery_workers.processors import illumina

    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job(organism)
    context = illumina.illumina_to_pcl(processor_job.pk)

    # Every sample should have produced a smashable result file on disk.
    for sample in context["samples"]:
        result_file = sample.get_most_recent_smashable_result_file()
        self.assertTrue(os.path.exists(result_file.absolute_file_path))
        os.remove(result_file.absolute_file_path)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_illumina_quoted_row_names(self):
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE33nnn/GSE33814/suppl/GSE33814%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename": "GSE33814_trimmed_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE33814_trimmed_non-normalized.txt",
            "organism": organism,
            "samples": [
                ("GSM836222", "IMGUS_32"),
                ("GSM836223", "IMGUS_33"),
            ],
        }
    )

    context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
    self.assertSucceeded(processor_job)

    def assert_unquoted(value: str):
        stripped = value.strip()
        self.assertNotEqual(stripped[0], '"')
        self.assertNotEqual(stripped[-1], '"')

    # After sanitizing, neither the column headers nor the probe IDs in the
    # first data row should be wrapped in quotes any more.
    with open(context["sanitized_file_path"], "r") as sanitized:
        rows = csv.reader(sanitized, delimiter="\t")
        for column_name in next(rows):
            assert_unquoted(column_name)
        assert_unquoted(next(rows)[0])
def test_illumina_space_separated(self):
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE48nnn/GSE48023/suppl/GSE48023%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename": "GSE48023_trimmed_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE48023_trimmed_non-normalized.txt",
            "organism": organism,
            "samples": [
                ("GSM1165512", "WholeBloodRNA_IN0242_Day0"),
                ("GSM1165513", "WholeBloodRNA_IN0242_Day1"),
                ("GSM1165514", "WholeBloodRNA_IN0242_Day14"),
                ("GSM1165515", "WholeBloodRNA_IN0242_Day3"),
                ("GSM1165516", "WholeBloodRNA_IN0243_Day0"),
            ],
        }
    )

    context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
    self.assertSucceeded(processor_job)

    # The sanitized file should be tab-separated: reading it as a TSV must
    # yield 11 headers (ID_REF + 5 observations + 5 p-values), with the
    # extra ID_REF header in front.
    with open(context["sanitized_file_path"], "r") as sanitized:
        header_row = next(csv.reader(sanitized, delimiter="\t"))

    self.assertEqual(len(header_row), 11)
    self.assertEqual(header_row[0], "ID_REF")
def test_good_detection(self):
    """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO.

    Shows our detector works.
    """
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz",
            "filename": "GSE54661_non_normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt",
            "organism": organism,
            "samples": [
                ("ABCD-1234", "CB CD34+ hypoxia"),
                ("ABCD-1235", "CB CD34+ normoxia"),
            ],
        }
    )

    context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
    self.assertSucceeded(processor_job)
    self.assertEqual(context["platform"], "illuminaHumanv3")

    # The first sample's annotation data should contain only detection stats.
    allowed_keys = ["detected_platform", "detection_percentage", "mapped_percentage"]
    for key in context["samples"][0].sampleannotation_set.all()[0].data.keys():
        self.assertTrue(key in allowed_keys)

    # Every sample should have produced a smashable result file on disk.
    for sample in context["samples"]:
        result_file = sample.get_most_recent_smashable_result_file()
        self.assertTrue(os.path.exists(result_file.absolute_file_path))

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(context["work_dir"], ignore_errors=True)
def test_illumina_latin1_input(self):
    """Test a latin1-encoded Illumina file.

    GSE106321 is encoded in latin1 and uses µ in the title of some
    columns, so preparing the file would cause a UnicodeParseError. Make
    sure that doesn't happen any more.
    """
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    processor_job = prepare_illumina_job(
        {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE106nnn/GSE106321/suppl/GSE106321_non-normalized.txt.gz",
            "filename": "GSE106321_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE106321_non-normalized.txt",
            "organism": organism,
            "samples": [
                (
                    "GSM2835938",
                    "A375 + 24h vem (3µM) 2",
                    {"description": ["A375 + 24h vem (3µM) 2"]},
                ),
                (
                    "GSM2835937",
                    "A375 + 24h vem (3µM) 1",
                    {"description": ["A375 + 24h vem (3µM) 1"]},
                ),
                (
                    "GSM2835936",
                    "A375 + 24h vem (3µM)",
                    {"description": ["A375 + 24h vem (3µM)"]},
                ),
                (
                    "GSM2835935",
                    "A375 + 24h DMSO 2",
                    {"description": ["A375 + 24h DMSO 2"]},
                ),
                (
                    "GSM2835934",
                    "A375+ 24h DMSO 1",
                    {"description": ["A375+ 24h DMSO 1"]},
                ),
                (
                    "GSM2835933",
                    "A375 + 24h DMSO",
                    {"description": ["A375 + 24h DMSO"]},
                ),
            ],
        }
    )

    context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)

    # XXX: For now, this processor job fails, but we want to make sure that
    # it fails in the right place. See
    # https://github.com/AlexsLemonade/refinebio/issues/2870 for why it is
    # failing.
    self.assertFailed(
        processor_job,
        "Encountered error in R code while running illumina.R pipeline during processing"
    )

    # Make sure that the input is now utf-8 encoded and has the right
    # headers. Trying to open a latin1 file as utf-8 would raise, so if
    # opening succeeds we can assume the re-encoding succeeded.
    with open(context["sanitized_file_path"], "r", encoding="utf-8") as sanitized:
        header_row = next(csv.reader(sanitized, delimiter="\t"))

    # Check the headers to make sure that the mu was correctly re-encoded.
    self.assertEqual(
        header_row,
        [
            "ID_REF",
            "A375 + 24h DMSO",
            "Detection Pval",
            "A375+ 24h DMSO 1",
            "Detection Pval",
            "A375 + 24h DMSO 2",
            "Detection Pval",
            "A375 + 24h vem (3µM)",
            "Detection Pval",
            "A375 + 24h vem (3µM) 1",
            "Detection Pval",
            "A375 + 24h vem (3µM) 2",
            "Detection Pval",
            "",
            "",
            "",
            "",
            "",
            "",
        ],
    )