def prepare_job():
    """Build the directories and database rows a janitor run is tested against.

    For every index below ``JOBS`` this creates a ``processor_job_<i>`` work
    directory, an ``SRP<i>/SRR<i>`` tree, and a sample linked to one computed
    file. It then adds an out-of-range ``_index`` directory and sample, two
    SALMON jobs with distinct batch ids, and finally the JANITOR job.

    Returns:
        The saved ``ProcessorJob`` whose pipeline is ``"JANITOR"``.
    """
    # One work dir, one study/run tree and one sample per job index.
    for index in range(JOBS):
        suffix = str(index)
        os.makedirs(LOCAL_ROOT_DIR + "/processor_job_" + suffix, exist_ok=True)

        # These live on prod volumes at locations such as:
        # /var/ebs/SRP057116/SRR1972985/SRR1972985.sra
        study_dir = LOCAL_ROOT_DIR + "/SRP" + suffix
        os.makedirs(study_dir, exist_ok=True)
        os.makedirs(study_dir + "/SRR" + suffix, exist_ok=True)

        run_sample = Sample()
        run_sample.accession_code = "SRR" + suffix
        run_sample.save()

        result = ComputationalResult()
        result.save()

        computed = ComputedFile()
        computed.result = result
        computed.size_in_bytes = 666
        computed.save()

        link = SampleComputedFileAssociation()
        link.sample = run_sample
        link.computed_file = computed
        link.save()

    # Create a job out of the range with index in it to make sure we
    # don't delete index directories since that's where transcriptome
    # indices get downloaded to.
    extra = str(JOBS + 1)
    os.makedirs(LOCAL_ROOT_DIR + "/processor_job_" + extra + "_index", exist_ok=True)
    os.makedirs(LOCAL_ROOT_DIR + "/SRP" + extra + "/SRR" + extra, exist_ok=True)

    extra_sample = Sample()
    extra_sample.accession_code = "SRR" + extra
    extra_sample.save()

    # Save two jobs so that we trigger two special circumstances, one
    # where the job is still running and the other where the job isn't
    # in Batch anymore.
    for batch_id in ("running_job", "missing_job"):
        salmon_job = ProcessorJob()
        salmon_job.pipeline_applied = "SALMON"
        salmon_job.batch_job_id = batch_id
        salmon_job.save()

    janitor_job = ProcessorJob()
    janitor_job.pipeline_applied = "JANITOR"
    janitor_job.save()

    return janitor_job
def test_processed_samples_only(self):
    """Filtering on processed samples hides experiments that have none."""
    search_url = reverse("search", kwargs={"version": API_VERSION})

    exp = Experiment()
    exp.accession_code = "GSX12345"
    exp.is_public = True
    exp.save()

    raw_sample = Sample()
    raw_sample.title = "I am unprocessed"
    raw_sample.accession_code = "GSXUnprocessed"
    raw_sample.is_processed = False
    raw_sample.save()

    raw_link = ExperimentSampleAssociation()
    raw_link.sample = raw_sample
    raw_link.experiment = exp
    raw_link.save()

    # we return all experiments
    unfiltered = self.client.get(search_url, {"search": "GSX12345"})
    self.assertEqual(unfiltered.json()["count"], 1)

    # check requesting only experiments with processed samples
    filtered = self.client.get(
        search_url,
        {"search": "GSX12345", "num_processed_samples__gt": 0},
    )
    self.assertEqual(filtered.json()["count"], 0)

    done_sample = Sample()
    done_sample.title = "I am processed"
    done_sample.accession_code = "GSXProcessed"
    done_sample.is_processed = True
    done_sample.save()

    done_link = ExperimentSampleAssociation()
    done_link.sample = done_sample
    done_link.experiment = exp
    done_link.save()

    # update cached values
    exp.num_total_samples = 2
    exp.num_processed_samples = 1
    exp.save()

    refreshed = self.client.get(search_url, {"search": "GSX12345"})
    self.assertEqual(refreshed.json()["count"], 1)
    self.assertEqual(len(exp.processed_samples), 1)

    # Remove the rows this test created.
    exp.delete()
    raw_sample.delete()
    done_sample.delete()
def test_management_command(self):
    """Importing TEST_KEYWORDS attaches the expected keywords to both samples."""
    first = Sample()
    first.accession_code = "DRR000897"
    first.technology = "RNA-SEQ"
    first.source_database = "SRA"
    first.title = "Not important"
    first.save()

    second = Sample()
    second.accession_code = "DRR001173"
    second.technology = "RNA-SEQ"
    second.source_database = "SRA"
    second.title = "Not important"
    second.save()

    Command().handle(
        file=TEST_KEYWORDS,
        source_name="refinebio_tests",
        methods_url="ccdatalab.org",
    )

    # If you look below you'll only see 14, but this is because DRR001173
    # has two pairs of terms from different ontologies with the same
    # human-readable name
    self.assertEqual(SampleKeyword.objects.all().count(), 16)

    # I checked all of these manually in the Ontology Lookup Service, the
    # data itself comes from MetaSRA
    first_names = set(
        first.keywords.values_list("name__human_readable_name", flat=True)
    )
    self.assertEqual(
        first_names,
        {"late embryonic stage", "serum", "late embryo", "cultured cell"},
    )

    second_names = set(
        second.keywords.values_list("name__human_readable_name", flat=True)
    )
    self.assertEqual(
        second_names,
        {
            "epithelial neoplasm",
            "cancer",
            "neoplasm",
            "bladder disease",
            "bladder carcinoma",
            "carcinoma",
            "disease",
            "disease of cellular proliferation",
            # EFO:0000322 is now obsolete according to the OLS
            # https://www.ebi.ac.uk/ols/ontologies/efo/terms?iri=http%3A%2F%2Fwww.ebi.ac.uk%2Fefo%2FEFO_0000322
            # so its human-readable name gets prefixed by them with "obsolete_"
            "obsolete_cell line",
            "cultured cell",
        },
    )
def prepare_illumina_job(organism):
    """Create an ILLUMINA_TO_PCL job for the GSE22427 fixture file.

    Twelve samples are created for ``organism``, each with a non-CCDL
    annotation holding its name as the description, and each linked to the
    original file. The last sample is then renamed so that neither its title
    nor its accession matches the name stored in its annotation.

    Returns:
        The saved ``ProcessorJob``.
    """
    job = ProcessorJob()
    job.pipeline_applied = "ILLUMINA_TO_PCL"
    job.save()

    raw_file = OriginalFile()
    raw_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz"
    raw_file.filename = "GSE22427_non-normalized.txt"
    raw_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt"
    )
    raw_file.is_downloaded = True
    raw_file.save()

    job_file_link = ProcessorJobOriginalFileAssociation()
    job_file_link.original_file = raw_file
    job_file_link.processor_job = job
    job_file_link.save()

    titles = (
        "LV-C&si-Control-1",
        "LV-C&si-Control-2",
        "LV-C&si-Control-3",
        "LV-C&si-EZH2-1",
        "LV-C&si-EZH2-2",
        "LV-C&si-EZH2-3",
        "LV-EZH2&si-EZH2-1",
        "LV-EZH2&si-EZH2-2",
        "LV-EZH2&si-EZH2-3",
        "LV-T350A&si-EZH2-1",
        "LV-T350A&si-EZH2-2",
        "LV-T350A&si-EZH2-3",
    )
    for title in titles:
        member = Sample()
        member.accession_code = title
        member.title = title
        member.organism = organism
        member.save()

        annotation = SampleAnnotation()
        annotation.sample = member
        annotation.data = {"description": [title]}
        annotation.is_ccdl = False
        annotation.save()

        file_link = OriginalFileSampleAssociation()
        file_link.original_file = raw_file
        file_link.sample = member
        file_link.save()

    # Rename the last sample; its annotation still carries the old name.
    renamed = Sample.objects.get(title="LV-T350A&si-EZH2-3")
    renamed.title = "ignoreme_for_description"
    renamed.accession_code = "ignoreme_for_description"
    renamed.save()

    return job
def setUp(self):
    """Create one experiment with two samples, two ontology terms and a contribution.

    The experiment and the contribution are kept on ``self`` for the tests.
    """
    exp = Experiment()
    exp.accession_code = "GSE000"
    exp.alternate_accession_code = "E-GEOD-000"
    exp.title = "NONONONO"
    exp.description = "Boooooourns. Wasabi."
    exp.technology = "RNA-SEQ"
    exp.save()
    self.experiment = exp

    # Create some samples to attach keywords to
    for accession in ("SRR123", "SRR456"):
        member = Sample()
        member.accession_code = accession
        member.technology = "RNA-SEQ"
        member.source_database = "SRA"
        member.title = "Not important"
        member.save()

        membership = ExperimentSampleAssociation()
        membership.sample = member
        membership.experiment = exp
        membership.save()

    # Create the ontology terms I'm using in the tests
    length_term = OntologyTerm()
    length_term.ontology_term = "PATO:0000122"
    length_term.human_readable_name = "length"
    length_term.save()

    thou_term = OntologyTerm()
    thou_term.ontology_term = "UO:0010012"
    thou_term.human_readable_name = "thou"
    thou_term.save()

    source = Contribution()
    source.source_name = "refinebio_tests"
    source.methods_url = "ccdatalab.org"
    source.save()
    self.contribution = source
def test_sra_experiment_missing_alternate_accession(self):
    """Tests that an SRA experiment has its missing alternate_accession_code added."""
    # 1. Create an experiment without an alternate_accession_code
    experiment = Experiment()
    experiment.accession_code = "SRP094947"
    experiment.source_database = "SRA"
    experiment.title = "Not important"
    experiment.save()

    # 2. We need to add a sample because the way that the SRA surveyor finds metadata is
    # through run accessions
    sample = Sample()
    sample.accession_code = "SRR5099111"
    sample.technology = "RNA-SEQ"
    sample.source_database = "SRA"
    sample.title = "Not important"
    sample.save()

    ExperimentSampleAssociation.objects.get_or_create(experiment=experiment, sample=sample)

    # 3. Setup is done, actually run the command.
    command = Command()
    command.handle()

    # 4. Refresh the experiment
    experiment.refresh_from_db()

    # Test that the correct alternate_accession_code was added.
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(experiment.alternate_accession_code, "GSE92260")
def test_download_file(self):
    """download_sra succeeds for an ENA fastq URL and yields the expected file."""
    job = DownloaderJob()
    job.accession_code = "ERR036"
    job.save()

    remote_file = OriginalFile()
    remote_file.source_filename = "ERR036000.fastq.gz"
    remote_file.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    remote_file.is_archive = True
    remote_file.save()

    run_sample = Sample()
    run_sample.accession_code = "ERR036000"
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = remote_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = remote_file
    job_link.save()

    succeeded, fetched = sra.download_sra(job.pk)

    self.assertTrue(succeeded)
    first_file = fetched[0]
    self.assertEqual(first_file.sha1, "1dfe5460a4101fe87feeffec0cb2e053f6695961")
    self.assertTrue(os.path.exists(first_file.absolute_file_path))
def test_download_file(self, mock_send_job):
    """Run download_sra against an ENA fastq URL with job dispatch mocked out.

    No assertion is made on the outcome; the test only checks that the call
    completes without raising.
    """
    mock_send_job.return_value = None

    job = DownloaderJob()
    job.accession_code = "ERR036"
    job.save()

    remote_file = OriginalFile()
    remote_file.source_filename = "ERR036000.fastq.gz"
    remote_file.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    remote_file.is_archive = True
    remote_file.save()

    run_sample = Sample()
    run_sample.accession_code = "ERR036000"
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = remote_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = remote_file
    job_link.save()

    # The return value is intentionally not inspected here.
    sra.download_sra(job.pk)
def test_download_file_ncbi(self, mock_send_job):
    """download_sra fetches a .sra file from NCBI with job dispatch mocked out."""
    mock_send_job.return_value = None

    dlj = DownloaderJob()
    dlj.accession_code = "DRR002116"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "DRR002116.sra"
    # The user@host prefix had been replaced by an email-obfuscation
    # placeholder ("[email protected]"); restored to NCBI's anonymous Aspera
    # endpoint. NOTE(review): confirm against the surveyor's URL template.
    og.source_url = "anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = "DRR002116"
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.sample = sample
    sample_assoc.original_file = og
    sample_assoc.save()

    job_assoc = DownloaderJobOriginalFileAssociation()
    job_assoc.downloader_job = dlj
    job_assoc.original_file = og
    job_assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "d5374e7fe047d4f76b165c3f5148ab2df9d42cea")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file_ncbi(self):
    """download_sra fetches SRR9117853.sra from NCBI and yields the expected file."""
    dlj = DownloaderJob()
    dlj.accession_code = "SRR9117853"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "SRR9117853.sra"
    # The user@host prefix had been replaced by an email-obfuscation
    # placeholder ("[email protected]"); restored to NCBI's anonymous Aspera
    # endpoint. NOTE(review): confirm against the surveyor's URL template.
    og.source_url = "anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = "SRR9117853"
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.sample = sample
    sample_assoc.original_file = og
    sample_assoc.save()

    job_assoc = DownloaderJobOriginalFileAssociation()
    job_assoc.downloader_job = dlj
    job_assoc.original_file = og
    job_assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_get_sample_keywords(self):
    """An experiment reports the keyword names attached to its samples."""
    exp = Experiment()
    exp.save()

    member = Sample()
    member.title = "123"
    member.accession_code = "123"
    member.age = 23
    member.save()

    membership = ExperimentSampleAssociation()
    membership.sample = member
    membership.experiment = exp
    membership.save()

    term = OntologyTerm()
    term.ontology_term = "EFO:0002939"
    term.human_readable_name = "medulloblastoma"
    term.save()

    contribution, _created = Contribution.objects.get_or_create(
        source_name="Refinebio Tests", methods_url="ccdatalab.org"
    )

    keyword = SampleKeyword()
    keyword.name = term
    keyword.source = contribution
    keyword.sample = member
    keyword.save()

    self.assertEqual(set(exp.get_sample_keywords()), {"medulloblastoma"})
def test_management_command(self):
    """Importing TEST_METADATA adds one attribute that shows up in the metadata dict."""
    target = Sample()
    target.accession_code = "DRR001173"
    target.technology = "RNA-SEQ"
    target.source_database = "SRA"
    target.title = "Not important"
    target.save()

    Command().handle(
        file=TEST_METADATA,
        source_name="refinebio_tests",
        methods_url="ccdatalab.org",
    )

    self.assertEqual(SampleAttribute.objects.all().count(), 1)

    metadata = target.to_metadata_dict()
    self.assertIsNotNone(metadata.get("other_metadata", None))
    self.assertEqual(len(metadata["other_metadata"]), 1)

    # Make sure everything matches what was in TEST_METADATA
    self.assertEqual(metadata["other_metadata"][0]["name"]["term"], "PATO:0000047")
    self.assertEqual(metadata["other_metadata"][0]["name"]["name"], "biological sex")
    self.assertEqual(metadata["other_metadata"][0]["value"]["term"], "PATO:0000383")
    self.assertEqual(metadata["other_metadata"][0]["value"]["name"], "female")
    self.assertAlmostEqual(
        metadata["other_metadata"][0]["probability"], 0.7856624891880539
    )
def test_download_file_swapper(self, mock_send_job):
    """_download_file succeeds for an NCBI .sra URL when FTP is not forced."""
    mock_send_job.return_value = None

    dlj = DownloaderJob()
    dlj.accession_code = "DRR002116"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "DRR002116.sra"
    # The user@host prefix had been replaced by an email-obfuscation
    # placeholder ("[email protected]"); restored to NCBI's anonymous Aspera
    # endpoint. NOTE(review): confirm against the surveyor's URL template.
    og.source_url = "anonftp@ftp.ncbi.nlm.nih.gov:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = "DRR002116"
    sample.save()

    sample_assoc = OriginalFileSampleAssociation()
    sample_assoc.sample = sample
    sample_assoc.original_file = og
    sample_assoc.save()

    job_assoc = DownloaderJobOriginalFileAssociation()
    job_assoc.downloader_job = dlj
    job_assoc.original_file = og
    job_assoc.save()

    result = sra._download_file(og.source_url, dlj, "/tmp", force_ftp=False)
    self.assertTrue(result)
def prepare_dotsra_job(filename="ERR1562482.sra"):
    """Create a SALMON job whose single input is a ``.sra`` file.

    The job gets a random primary key, and the sample's accession is
    ``"SALMON"`` so the test files go to the right place.

    Returns:
        A ``(job, [original_file])`` tuple.
    """
    job = ProcessorJob()
    job.pipeline_applied = "SALMON"
    job.id = random.randint(111, 999999)
    job.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    run_sample = Sample()
    run_sample.accession_code = "SALMON"  # So the test files go to the right place
    run_sample.organism = worm
    run_sample.save()

    prepare_organism_indices()

    sra_file = OriginalFile()
    sra_file.source_filename = filename
    sra_file.filename = filename
    sra_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + filename
    sra_file.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = sra_file
    sample_link.sample = run_sample
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.original_file = sra_file
    job_link.processor_job = job
    job_link.save()

    return job, [sra_file]
def test_qn_management_command(self):
    """Test that the management command fires off and then does not create
    a job for an organism that does not have enough samples on the same
    platform."""
    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    human.save()

    exp = Experiment()
    exp.accession_code = "12345"
    exp.save()

    # We don't have a 0.tsv
    for code in ("1", "2", "3", "4", "5", "6"):
        member = Sample()
        member.accession_code = code
        member.title = code
        member.platform_accession_code = "A-MEXP-1171"
        member.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        member.organism = human
        member.technology = "MICROARRAY"
        member.is_processed = True
        member.save()

        result = ComputationalResult()
        result.save()

        tsv_name = code + ".tsv"
        tsv = ComputedFile()
        tsv.filename = tsv_name
        tsv.absolute_file_path = "/home/user/data_store/QN/" + tsv_name
        tsv.size_in_bytes = int(code)
        tsv.result = result
        tsv.is_smashable = True
        tsv.save()

        file_link = SampleComputedFileAssociation()
        file_link.sample = member
        file_link.computed_file = tsv
        file_link.save()

        membership = ExperimentSampleAssociation()
        membership.experiment = exp
        membership.sample = member
        membership.save()

    captured = StringIO()
    try:
        call_command("create_qn_target", organism="homo_sapiens", min=1, stdout=captured)
    except SystemExit:
        # this is okay!
        pass

    output = captured.getvalue()
    self.assertFalse("Target file" in output)

    # There's not enough samples available in this scenario so we
    # shouldn't have even made a processor job.
    self.assertEqual(ProcessorJob.objects.count(), 0)
def make_test_data(organism):
    """Create one processed RNA-SEQ sample in experiment GSE51088 for ``organism``.

    Two smashable computed files are saved against the same result: a
    ``quant.sf`` that lives on S3 and a ``logquant.tsv``. Only the
    ``logquant.tsv`` file is associated with the sample.
    """
    exp = Experiment()
    exp.accession_code = "GSE51088"
    exp.technology = "RNA-SEQ"
    exp.save()

    organism_link = ExperimentOrganismAssociation()
    organism_link.experiment = exp
    organism_link.organism = organism
    organism_link.save()

    comp_result = ComputationalResult()
    comp_result.save()

    member = Sample()
    member.accession_code = "GSM1237818"
    member.title = "GSM1237818"
    member.organism = organism
    member.technology = "RNA-SEQ"
    member.is_processed = True
    member.save()

    result_link = SampleResultAssociation()
    result_link.sample = member
    result_link.result = comp_result
    result_link.save()

    membership = ExperimentSampleAssociation()
    membership.experiment = exp
    membership.sample = member
    membership.save()

    quant = ComputedFile()
    quant.s3_key = "smasher-test-quant.sf"
    quant.s3_bucket = "data-refinery-test-assets"
    quant.filename = "quant.sf"
    quant.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf"
    quant.result = comp_result
    quant.is_smashable = True
    quant.size_in_bytes = 123123
    # this matches with the downloaded file
    quant.sha1 = "08c7ea90b66b52f7cd9d9a569717a1f5f3874967"
    quant.save()

    logquant = ComputedFile()
    logquant.filename = "logquant.tsv"
    logquant.is_smashable = True
    logquant.size_in_bytes = 123123
    logquant.result = comp_result
    logquant.save()

    file_link = SampleComputedFileAssociation()
    file_link.sample = member
    file_link.computed_file = logquant
    file_link.save()
def test_qn_reference(self, mock_send_job):
    """The command dispatches one job per organism when each has 200 samples."""
    current_organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    current_organism.save()

    exp = Experiment()
    exp.accession_code = "12345"
    exp.save()

    for number in range(1, 401):
        code = str(number)

        member = Sample()
        member.accession_code = code
        member.title = code
        member.platform_name = f"Affymetrix {current_organism.name}"
        member.platform_accession_code = f"A-MEXP-{current_organism.name}"
        member.manufacturer = "AFFYMETRIX"
        member.organism = current_organism
        member.technology = "MICROARRAY"
        member.is_processed = True
        member.has_raw = True
        member.save()

        result = ComputationalResult()
        result.save()

        tsv_name = code + ".tsv"
        tsv = ComputedFile()
        tsv.filename = tsv_name
        tsv.absolute_file_path = "/home/user/data_store/QN/" + tsv_name
        tsv.size_in_bytes = number
        tsv.result = result
        tsv.is_smashable = True
        tsv.save()

        file_link = SampleComputedFileAssociation()
        file_link.sample = member
        file_link.computed_file = tsv
        file_link.save()

        membership = ExperimentSampleAssociation()
        membership.experiment = exp
        membership.sample = member
        membership.save()

        # We need more than one organism for the tests, but can't
        # repeat accession codes, so halfway through just change the organism.
        if number == 200:
            current_organism = Organism(name="MUS_MUSCULUS", taxonomy_id=111)
            current_organism.save()

    # Setup is done, actually run the command.
    Command().handle(organisms="HOMO_SAPIENS,MUS_MUSCULUS")

    self.assertEqual(len(mock_send_job.mock_calls), 2)
    self.assertEqual(ProcessorJob.objects.count(), 2)
def test_fail(self):
    """Smashing a dataset whose only file does not exist reports unsmashable files."""
    comp_result = ComputationalResult()
    comp_result.save()

    member = Sample()
    member.accession_code = "XXX"
    member.title = "XXX"
    member.organism = Organism.get_object_for_name("HOMO_SAPIENS")
    member.save()

    result_link = SampleResultAssociation()
    result_link.sample = member
    result_link.result = comp_result
    result_link.save()

    missing = ComputedFile()
    missing.filename = "NOT_REAL.PCL"
    missing.absolute_file_path = "/home/user/data_store/PCL/" + missing.filename
    missing.result = comp_result
    missing.size_in_bytes = 123
    missing.is_smashable = True
    missing.save()

    file_link = SampleComputedFileAssociation()
    file_link.sample = member
    file_link.computed_file = missing
    file_link.save()

    dataset = Dataset()
    dataset.data = {"GSE51081": ["XXX"]}
    dataset.aggregate_by = "EXPERIMENT"
    dataset.scale_by = "MINMAX"
    dataset.email_address = "*****@*****.**"
    dataset.quantile_normalize = False
    dataset.save()
    dataset_id = dataset.id

    smash_job = ProcessorJob()
    smash_job.pipeline_applied = "SMASHER"
    smash_job.save()

    dataset_link = ProcessorJobDatasetAssociation()
    dataset_link.processor_job = smash_job
    dataset_link.dataset = dataset
    dataset_link.save()

    final_context = smasher.smash(smash_job.pk, upload=False)

    # Print the failure reason from the database row and from the context.
    reloaded = Dataset.objects.get(id=dataset_id)
    print(reloaded.failure_reason)
    print(final_context["dataset"].failure_reason)

    self.assertNotEqual(final_context["unsmashable_files"], [])
def _unpack_sample_spec(spec):
    """Normalize one sample spec into ``(accession_code, title, annotation)``.

    Accepted shapes are a bare string (used as both accession and title), a
    ``(accession, title)`` tuple, or an ``(accession, title, annotation)``
    tuple whose last item is a dict. ``annotation`` is ``None`` unless given.

    Raises:
        ValueError: if ``spec`` matches none of the accepted shapes.
    """
    if isinstance(spec, str):
        return spec, spec, None

    if isinstance(spec, tuple) and len(spec) in (2, 3):
        names_ok = isinstance(spec[0], str) and isinstance(spec[1], str)
        if names_ok and len(spec) == 2:
            return spec[0], spec[1], None
        if names_ok and isinstance(spec[2], dict):
            return spec[0], spec[1], spec[2]

    raise ValueError(f"Invalid sample type for sample {spec}")


def prepare_illumina_job(job_info: Dict) -> ProcessorJob:
    """Create an ILLUMINA_TO_PCL job described by ``job_info``.

    Args:
        job_info: Must provide ``source_filename``, ``filename``,
            ``absolute_file_path``, ``organism`` and ``samples``. Each entry
            of ``samples`` is a string, an ``(accession, title)`` tuple, or
            an ``(accession, title, annotation_dict)`` tuple.

    Returns:
        The saved ``ProcessorJob``.

    Raises:
        ValueError: if an entry of ``job_info["samples"]`` has an unsupported
            shape. The job, the original file and any earlier samples have
            already been saved by then.
    """
    pj = ProcessorJob()
    pj.pipeline_applied = "ILLUMINA_TO_PCL"
    pj.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = pj
    assoc1.save()

    for s in job_info["samples"]:
        # For convenience, if you give a list of strings we'll just use the
        # strings as both titles and accessions.
        accession_code, title, annotation = _unpack_sample_spec(s)

        sample = Sample()
        sample.accession_code = accession_code
        sample.title = title
        sample.organism = job_info["organism"]
        sample.save()

        sa = SampleAnnotation()
        sa.sample = sample
        # Fall back to a description built from the title when no
        # annotation was supplied.
        sa.data = annotation if annotation is not None else {"description": [title]}
        sa.is_ccdl = False
        sa.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

    return pj
def prepare_job():
    """Create a SALMON job with the two paired-end ERR1562482 fastq files.

    Both files are linked to a single C. elegans SRA / RNA-SEQ sample and to
    the job.

    Returns:
        A ``(job, [first_file, second_file])`` tuple.
    """
    job = ProcessorJob()
    job.pipeline_applied = "SALMON"
    job.save()

    worm = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    run_sample = Sample()
    run_sample.accession_code = "SALMON"  # So the test files go to the right place
    run_sample.organism = worm
    run_sample.source_database = "SRA"
    run_sample.technology = "RNA-SEQ"
    run_sample.save()

    prepare_organism_indices()

    reads = []
    for mate_name in ("ERR1562482_1.fastq.gz", "ERR1562482_2.fastq.gz"):
        fastq = OriginalFile()
        fastq.source_filename = mate_name
        fastq.filename = mate_name
        fastq.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/" + mate_name
        fastq.is_downloaded = True
        fastq.save()
        reads.append(fastq)

    for fastq in reads:
        sample_link = OriginalFileSampleAssociation()
        sample_link.original_file = fastq
        sample_link.sample = run_sample
        sample_link.save()

    for fastq in reads:
        job_link = ProcessorJobOriginalFileAssociation()
        job_link.original_file = fastq
        job_link.processor_job = job
        job_link.save()

    return job, reads
def test_good_detection(self):
    """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO.

    Shows our detector works.
    """
    from data_refinery_workers.processors import illumina

    job = ProcessorJob()
    job.pipeline_applied = "ILLUMINA_TO_PCL"
    job.save()

    raw_file = OriginalFile()
    raw_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
    raw_file.filename = "GSE54661_non_normalized.txt"
    raw_file.absolute_file_path = (
        "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
    )
    raw_file.is_downloaded = True
    raw_file.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.original_file = raw_file
    job_link.processor_job = job
    job_link.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    member = Sample()
    member.accession_code = "ABCD-1234"
    member.title = "hypoxia_Signal"
    member.organism = human
    member.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = raw_file
    sample_link.sample = member
    sample_link.save()

    final_context = illumina.illumina_to_pcl(job.pk)
    self.assertEqual(final_context["platform"], "illuminaHumanv3")

    # The first annotation of the first sample may only carry detection keys.
    allowed_keys = ["detected_platform", "detection_percentage", "mapped_percentage"]
    first_annotation = final_context["samples"][0].sampleannotation_set.all()[0]
    for key in first_annotation.data.keys():
        self.assertTrue(key in allowed_keys)

    # Cleanup after the job since it won't since we aren't running in cloud.
    shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_illumina_no_header(self):
    """NO_OP converts a header-less Illumina table and writes the expected output."""
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    # ex:
    # ILMN_1885639 10.0000 0.7931
    # ILMN_2209417 10.0000 0.2029
    # ILMN_1765401 152.0873 0.0000
    table = OriginalFile()
    table.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
    )
    table.filename = "GSM1089291-tbl-1.txt"
    table.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
    table.is_downloaded = True
    table.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    member = Sample()
    member.accession_code = "GSM557500"
    member.title = "GSM557500"
    member.platform_accession_code = "A-MEXP-1171"
    member.manufacturer = "ILLUMINA"
    member.organism = human
    member.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = table
    sample_link.sample = member
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.original_file = table
    job_link.processor_job = noop_job
    job_link.save()

    # To:
    # ENSG00000105675 10
    # ENSG00000085721 152.0873
    # ENSG00000278494 152.0873
    final_context = no_op.no_op_processor(noop_job.pk)

    self.assertTrue(final_context["success"])
    output_path = final_context["output_file_path"]
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(os.path.getsize(output_path), 786207)
def test_convert_processed_illumina(self):
    """NO_OP converts a processed Illumina table and the output passes the quality check."""
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    # ex:
    # Reporter Identifier VALUE Detection Pval
    # ILMN_1343291 14.943602 0
    # ILMN_1343295 13.528082 0
    table = OriginalFile()
    table.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
    table.filename = "GSM557500_sample_table.txt"
    table.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt"
    )
    table.is_downloaded = True
    table.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    member = Sample()
    member.accession_code = "GSM557500"
    member.title = "GSM557500"
    member.platform_accession_code = "A-MEXP-1171"
    member.manufacturer = "ILLUMINA"
    member.organism = human
    member.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = table
    sample_link.sample = member
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.original_file = table
    job_link.processor_job = noop_job
    job_link.save()

    # To:
    # ENSG00000156508 14.943602
    # ENSG00000111640 13.528082
    final_context = no_op.no_op_processor(noop_job.pk)

    self.assertTrue(final_context["success"])
    output_path = final_context["output_file_path"]
    self.assertTrue(os.path.exists(output_path))
    self.assertEqual(os.path.getsize(output_path), 920374)
    self.assertTrue(no_op.check_output_quality(output_path))
def prepare_job(length):
    """Create a TRANSCRIPTOME_INDEX_<LENGTH> job with a fasta and a gtf input.

    Args:
        length: Suffix used in the pipeline name (upper-cased), the sample
            accession, and the two file names.

    Returns:
        The saved ``ProcessorJob``.
    """
    job = ProcessorJob()
    job.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper()
    job.save()

    human = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=1001)

    index_sample = Sample()
    index_sample.organism = human
    index_sample.accession_code = "derp" + length
    index_sample.save()

    # NOTE(review): both on-disk paths point at the "*_short" fixtures no
    # matter what `length` is -- presumably intentional; confirm.
    fasta_name = "aegilops_tauschii_" + length + ".fa.gz"
    fasta = OriginalFile()
    fasta.source_filename = fasta_name
    fasta.filename = fasta_name
    fasta.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz"
    fasta.is_downloaded = True
    fasta.save()

    gtf_name = "aegilops_tauschii_" + length + ".gtf.gz"
    gtf = OriginalFile()
    gtf.source_filename = gtf_name
    gtf.filename = gtf_name
    gtf.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz"
    gtf.is_downloaded = True
    gtf.save()

    inputs = (fasta, gtf)

    for source in inputs:
        sample_link = OriginalFileSampleAssociation()
        sample_link.original_file = source
        sample_link.sample = index_sample
        sample_link.save()

    for source in inputs:
        job_link = ProcessorJobOriginalFileAssociation()
        job_link.original_file = source
        job_link.processor_job = job
        job_link.save()

    return job
def test_convert_illumina_bad_cols(self):
    """NO_OP fails on an Illumina table with unexpected extra columns.

    In future, this test may be deprecated. For now it just alerts that it
    needs attention.
    """
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    # ex:
    # ILMN_1885639 10.0000 0.7931 11.0000 0.123
    # ILMN_2209417 10.0000 0.2029 11.1234 0.543
    # LMN_1765401 152.0873 0.0000 99.999 0.19
    table = OriginalFile()
    table.source_filename = (
        "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
    )
    table.filename = "GSM1089291-tbl-1-modified.txt"
    table.absolute_file_path = (
        "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
    )
    table.is_downloaded = True
    table.save()

    human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    human.save()

    member = Sample()
    member.accession_code = "GSM557500"
    member.title = "GSM557500"
    member.platform_accession_code = "A-MEXP-1171"
    member.manufacturer = "ILLUMINA"
    member.organism = human
    member.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = table
    sample_link.sample = member
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.original_file = table
    job_link.processor_job = noop_job
    job_link.save()

    final_context = no_op.no_op_processor(noop_job.pk)

    self.assertFalse(final_context["success"])
    self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def test_get_sample_metadata_fields_none(self):
    """A sample with only a title and accession yields no metadata fields."""
    exp = Experiment()
    exp.save()

    bare_sample = Sample()
    bare_sample.title = "123"
    bare_sample.accession_code = "123"
    bare_sample.save()

    membership = ExperimentSampleAssociation()
    membership.sample = bare_sample
    membership.experiment = exp
    membership.save()

    self.assertEqual(exp.get_sample_metadata_fields(), [])
def test_get_sample_metadata_fields(self):
    """The fields populated on a sample are reported for its experiment."""
    exp = Experiment()
    exp.save()

    annotated = Sample()
    annotated.title = "123"
    annotated.accession_code = "123"
    annotated.specimen_part = "Lung"
    annotated.sex = "Male"
    annotated.save()

    membership = ExperimentSampleAssociation()
    membership.sample = annotated
    membership.experiment = exp
    membership.save()

    reported = set(exp.get_sample_metadata_fields())
    self.assertEqual(reported, {"specimen_part", "sex"})
def test_download_file_unmated_reads(self):
    """download_sra succeeds for a run made of two fastq files.

    Each file records its own expected MD5 and size before it is saved.
    """
    dlj = DownloaderJob()
    dlj.accession_code = "SRR1603661"
    dlj.save()

    og_1 = OriginalFile()
    og_1.source_filename = "SRR1603661_1.fastq.gz"
    og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz"
    og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c"
    og_1.expected_size_in_bytes = 6751980628
    og_1.is_archive = True
    og_1.save()

    og_2 = OriginalFile()
    og_2.source_filename = "SRR1603661_2.fastq.gz"
    og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz"
    # These two values used to be assigned to og_1 (copy-paste slip), which
    # left og_2 without expectations and changed og_1 only in memory.
    og_2.expected_md5 = "fffd24457418d255991f54ec82a39d57"
    og_2.expected_size_in_bytes = 6949912932
    og_2.is_archive = True
    og_2.save()

    sample = Sample()
    sample.accession_code = "SRR1603661"
    sample.save()

    # Link each file to the sample, then to the job, in file order.
    for og in (og_1, og_2):
        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.sample = sample
        sample_assoc.original_file = og
        sample_assoc.save()

        job_assoc = DownloaderJobOriginalFileAssociation()
        job_assoc.downloader_job = dlj
        job_assoc.original_file = og
        job_assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "52bf22472069d04fa7767429f6ab78ebd10c0152")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def prepare_job(job_info: dict) -> ProcessorJob:
    """Create a NO_OP job with one downloaded file and one sample.

    Args:
        job_info: Must provide ``source_filename``, ``filename``,
            ``absolute_file_path``, ``accession_code`` and
            ``platform_accession_code``; ``manufacturer`` is optional.

    Returns:
        The saved ``ProcessorJob``.
    """
    noop_job = ProcessorJob()
    noop_job.pipeline_applied = "NO_OP"
    noop_job.save()

    source = OriginalFile()
    source.source_filename = job_info["source_filename"]
    source.filename = job_info["filename"]
    source.absolute_file_path = job_info["absolute_file_path"]
    source.is_downloaded = True
    source.save()

    accession = job_info["accession_code"]
    member = Sample()
    member.accession_code = accession
    member.title = accession
    member.platform_accession_code = job_info["platform_accession_code"]

    maker = job_info.get("manufacturer", None)
    if maker is not None:
        member.manufacturer = maker

    # The illumina samples need the human organism
    if maker == "ILLUMINA":
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()
        member.organism = human

    member.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.original_file = source
    sample_link.sample = member
    sample_link.save()

    job_link = ProcessorJobOriginalFileAssociation()
    job_link.original_file = source
    job_link.processor_job = noop_job
    job_link.save()

    return noop_job
def test_get_results(self):
    """Test our ability to collect the appropriate samples."""
    member = Sample()
    member.accession_code = "GSM45588"
    member.save()

    comp_result = ComputationalResult()
    comp_result.save()

    smashable = ComputedFile()
    smashable.filename = "oh_boy.txt"
    smashable.result = comp_result
    smashable.size_in_bytes = 123
    smashable.is_smashable = True
    smashable.save()

    unsmashable = ComputedFile()
    unsmashable.filename = "gee_whiz.bmp"
    unsmashable.result = comp_result
    unsmashable.size_in_bytes = 123
    unsmashable.is_smashable = False
    unsmashable.save()

    result_link = SampleResultAssociation()
    result_link.sample = member
    result_link.result = comp_result
    result_link.save()

    for computed in (smashable, unsmashable):
        file_link = SampleComputedFileAssociation()
        file_link.sample = member
        file_link.computed_file = computed
        file_link.save()

    # Both files come back, whether smashable or not.
    found = member.get_result_files()
    self.assertEqual(found.count(), 2)