def test_download_file_ncbi(self, mock_send_job): mock_send_job.return_value = None dlj = DownloaderJob() dlj.accession_code = "DRR002116" dlj.save() og = OriginalFile() og.source_filename = "DRR002116.sra" og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra" og.is_archive = True og.save() sample = Sample() sample.accession_code = 'DRR002116' sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() result, downloaded_files = sra.download_sra(dlj.pk) utils.end_downloader_job(dlj, result) self.assertTrue(result) self.assertEqual(downloaded_files[0].sha1, 'd5374e7fe047d4f76b165c3f5148ab2df9d42cea') self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() original_file = OriginalFile() original_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip" original_file.filename = "GSM1426071_CD_colon_active_1.CEL" original_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL") original_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = original_file assoc1.processor_job = pj assoc1.save() c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") sample = Sample() sample.title = "Heyo" sample.organism = c_elegans sample.is_processed = False sample.save() ogsa = OriginalFileSampleAssociation() ogsa.sample = sample ogsa.original_file = original_file ogsa.save() return pj
def _make_original_file_with_contents(contents: str) -> OriginalFile: _, path = tempfile.mkstemp(suffix=".txt") with open(path, "w") as f: f.write(contents) og_file = OriginalFile() og_file.source_filename = path og_file.filename = os.path.basename(path) og_file.absolute_file_path = os.path.realpath(path) og_file.is_downloaded = True og_file.save() return og_file
def prepare_illumina_job(job_info: Dict) -> ProcessorJob: pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = job_info["source_filename"] og_file.filename = job_info["filename"] og_file.absolute_file_path = job_info["absolute_file_path"] og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() for s in job_info["samples"]: # For convenience, if you give a list of strings we'll just use the # strings as both titles and accessions. annotation = None if type(s) == str: accession_code = s title = s elif type(s) == tuple and list(map(type, s)) == [str, str]: accession_code, title = s elif type(s) == tuple and list(map(type, s)) == [str, str, dict]: accession_code, title, annotation = s else: raise ValueError(f"Invalid sample type for sample {s}") sample = Sample() sample.accession_code = accession_code sample.title = title sample.organism = job_info["organism"] sample.save() sa = SampleAnnotation() sa.sample = sample sa.data = annotation if annotation is not None else { "description": [title] } sa.is_ccdl = False sa.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() return pj
def test_convert_processed_illumina(self): job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # Reporter Identifier VALUE Detection Pval # ILMN_1343291 14.943602 0 # ILMN_1343295 13.528082 0 og_file = OriginalFile() og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/" og_file.filename = "GSM557500_sample_table.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt") og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() # To: # ENSG00000156508 14.943602 # ENSG00000111640 13.528082 final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["output_file_path"])) self.assertEqual(os.path.getsize(final_context["output_file_path"]), 920374) self.assertTrue( no_op.check_output_quality(final_context["output_file_path"]))
def test_good_detection(self): """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """ from data_refinery_workers.processors import illumina pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz" og_file.filename = "GSE54661_non_normalized.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt" ) og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) organism.save() sample = Sample() sample.accession_code = "ABCD-1234" sample.title = "hypoxia_Signal" sample.organism = organism sample.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() final_context = illumina.illumina_to_pcl(pj.pk) self.assertEqual(final_context["platform"], "illuminaHumanv3") for key in final_context["samples"][0].sampleannotation_set.all( )[0].data.keys(): self.assertTrue(key in [ "detected_platform", "detection_percentage", "mapped_percentage" ]) # Cleanup after the job since it won't since we aren't running in cloud. shutil.rmtree(final_context["work_dir"], ignore_errors=True)
def test_convert_illumina_no_header(self): job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # ILMN_1885639 10.0000 0.7931 # ILMN_2209417 10.0000 0.2029 # ILMN_1765401 152.0873 0.0000 og_file = OriginalFile() og_file.source_filename = ( "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt" ) og_file.filename = "GSM1089291-tbl-1.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt" og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() # To: # ENSG00000105675 10 # ENSG00000085721 152.0873 # ENSG00000278494 152.0873 final_context = no_op.no_op_processor(job.pk) self.assertTrue(final_context["success"]) self.assertTrue(os.path.exists(final_context["output_file_path"])) self.assertEqual(os.path.getsize(final_context["output_file_path"]), 786207)
def test_convert_illumina_bad_cols(self): """ In future, this test may be deprecated. For now it just alerts that it needs attention. """ job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() # ex: # ILMN_1885639 10.0000 0.7931 11.0000 0.123 # ILMN_2209417 10.0000 0.2029 11.1234 0.543 # LMN_1765401 152.0873 0.0000 99.999 0.19 og_file = OriginalFile() og_file.source_filename = ( "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt" ) og_file.filename = "GSM1089291-tbl-1-modified.txt" og_file.absolute_file_path = ( "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt" ) og_file.is_downloaded = True og_file.save() homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample = Sample() sample.accession_code = "GSM557500" sample.title = "GSM557500" sample.platform_accession_code = "A-MEXP-1171" sample.manufacturer = "ILLUMINA" sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() final_context = no_op.no_op_processor(job.pk) self.assertFalse(final_context["success"]) self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
def prepare_illumina_job(species="H**o sapiens"): pj = ProcessorJob() pj.pipeline_applied = "ILLUMINA_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427%5Fnon%2Dnormalized%2Etxt.gz" og_file.filename = "GSE22427_non-normalized.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/ILLUMINA/GSE22427_non-normalized.txt" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() sample_names = [ "LV-C&si-Control-1", "LV-C&si-Control-2", "LV-C&si-Control-3", "LV-C&si-EZH2-1", "LV-C&si-EZH2-2", "LV-C&si-EZH2-3", "LV-EZH2&si-EZH2-1", "LV-EZH2&si-EZH2-2", "LV-EZH2&si-EZH2-3", "LV-T350A&si-EZH2-1", "LV-T350A&si-EZH2-2", "LV-T350A&si-EZH2-3" ] for name in sample_names: sample = Sample() sample.accession_code = name sample.title = name sample.organism = Organism.get_object_for_name(species) sample.save() sa = SampleAnnotation() sa.sample = sample sa.data = {'description': [name]} sa.is_ccdl = False sa.save() sample_assoc = OriginalFileSampleAssociation() sample_assoc.original_file = og_file sample_assoc.sample = sample sample_assoc.save() sample = Sample.objects.get(title="LV-T350A&si-EZH2-3") sample.title = "ignoreme_for_description" sample.accession_code = "ignoreme_for_description" sample.save() return pj
def test_assembly_information(self): og_file = OriginalFile() og_file.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz" og_file.source_filename = "aegilops_tauschii_short.fa.gz" og_file2 = OriginalFile() og_file2.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz" og_file2.source_filename = "aegilops_tauschii_short.gtf.gz" job_context = { "original_files": [og_file, og_file2], "computed_files": [] } job_context = transcriptome_index._extract_assembly_information( job_context) self.assertEqual("39", job_context["assembly_version"]) self.assertEqual("ASM34733v1", job_context["assembly_name"])
def prepare_agilent_twocolor_job(): pj = ProcessorJob() pj.pipeline_applied = "AGILENT_TWOCOLOR_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE22900&format=file" og_file.filename = "GSM466597_95899_agilent.txt" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/AGILENT_TWOCOLOR/GSM466597_95899_agilent.txt" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def prepare_non_ba_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM45nnn/GSM45588/suppl/GSM45588.CEL.gz" og_file.filename = "GSM45588.CEL" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM45588.CEL" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def prepare_ba_job(): pj = ProcessorJob() pj.pipeline_applied = "AFFY_TO_PCL" pj.save() og_file = OriginalFile() og_file.source_filename = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip" og_file.filename = "GSM1426071_CD_colon_active_1.CEL" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/CEL/GSM1426071_CD_colon_active_1.CEL" og_file.is_downloaded = True og_file.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() return pj
def test_download_file(self, mock_send_job): mock_send_job.return_value = None dlj = DownloaderJob() dlj.save() og = OriginalFile() og.source_filename = "Aegilops_tauschii.ASM34733v1.37.gtf.gz" og.source_url = self.gtf_download_url og.is_archive = True og.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() transcriptome_index.download_transcriptome(dlj.pk)
def prepare_job(job_info: dict) -> ProcessorJob: job = ProcessorJob() job.pipeline_applied = "NO_OP" job.save() og_file = OriginalFile() og_file.source_filename = job_info["source_filename"] og_file.filename = job_info["filename"] og_file.absolute_file_path = job_info["absolute_file_path"] og_file.is_downloaded = True og_file.save() sample = Sample() sample.accession_code = job_info["accession_code"] sample.title = job_info["accession_code"] sample.platform_accession_code = job_info["platform_accession_code"] manufacturer = job_info.get("manufacturer", None) if manufacturer is not None: sample.manufacturer = manufacturer # The illumina samples need the human organism if manufacturer == "ILLUMINA": homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) homo_sapiens.save() sample.organism = homo_sapiens sample.save() assoc = OriginalFileSampleAssociation() assoc.original_file = og_file assoc.sample = sample assoc.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = job assoc1.save() return job
def test_no_rnaseq(self): """Makes sure that no RNA-Seq data gets downloaded even if there's a job for it. """ dlj = DownloaderJob() dlj.accession_code = 'GSE103217' dlj.save() original_file = OriginalFile() original_file.filename = "GSE103217_family.xml.tgz" original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103217/miniml/GSE103217_family.xml.tgz" original_file.source_filename = "GSE103217_family.xml.tgz" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj assoc.save() sample = Sample() sample.accession_code = 'GSE103217' sample.technology = "RNA-SEQ" sample.manufacturer = "ILLUMINA" sample.platform_accession_code = "Illumina HiSeq 2500" sample.save() og_assoc = OriginalFileSampleAssociation() og_assoc.sample = sample og_assoc.original_file = original_file og_assoc.save() download_result = geo.download_geo(dlj.id) self.assertFalse(download_result) dlj.refresh_from_db() self.assertFalse(dlj.success) # It's not necessarily that we didn't extract any files, but # none that were usable so it looks like none. self.assertEqual(dlj.failure_reason, "Failed to extract any downloaded files.")
def test_download_file_swapper(self): dlj = DownloaderJob() dlj.accession_code = "SRR9117853" dlj.save() og = OriginalFile() og.source_filename = "SRR9117853.sra" og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra" og.is_archive = True og.save() sample = Sample() sample.accession_code = "SRR9117853" sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = sample assoc.original_file = og assoc.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = dlj assoc.original_file = og assoc.save() result = sra._download_file(og.source_url, dlj, "/tmp/doomed", force_ftp=False) self.assertTrue(result)
def _download_file(original_file: OriginalFile, downloader_job: DownloaderJob, target_file_path: str) -> bool: """ Download file dispatcher. Dispatches to the HTTP or Aspera downloader """ download_url = original_file.source_url # SRA files have Apsera downloads. if "ftp.sra.ebi.ac.uk" in download_url: # From: ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz # To: [email protected]:/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz download_url = download_url.replace("ftp", "era-fasp@fasp") download_url = download_url.replace(".uk/", ".uk:/") original_file.source_url = download_url return _download_file_aspera(download_url, downloader_job, target_file_path, 0, original_file, source="ENA") elif "ncbi.nlm.nih.gov" in download_url: # Try to convert old-style endpoints into new-style endpoints if possible try: if "anonftp" in download_url or "dbtest" in download_url: accession = download_url.split("/")[-1].split(".sra")[0] new_url = get_https_sra_download(accession) if new_url: download_url = new_url except Exception: pass return _download_file_http(download_url, downloader_job, target_file_path) else: downloader_job.failure_reason = ( "Unrecognized URL pattern: {}").format(download_url) return False return True
def test_queue_downloader_jobs_for_original_files(self, mock_send_task): """Make sure that queue_downloader_jobs queues all expected Downloader jobs for a given experiment. """ # First, create an experiment with two samples associated with it # and create two original files for each of those samples. experiment_object = Experiment() experiment_object.accession_code = "Experiment1" experiment_object.save() sample_object_1 = Sample() sample_object_1.accession_code = "Sample1" sample_object_1.platform_accession_code = "Illumina Genome Analyzer" sample_object_1.platform_accession_name = "Illumina Genome Analyzer" sample_object_1.technology = "RNA-SEQ" sample_object_1.manufacturer = "ILLUMINA" sample_object_1.source_database = "SRA" sample_object_1.save() sample_object_2 = Sample() sample_object_2.accession_code = "Sample2" sample_object_2.platform_accession_code = "Illumina Genome Analyzer" sample_object_2.platform_accession_name = "Illumina Genome Analyzer" sample_object_2.technology = "RNA-SEQ" sample_object_2.manufacturer = "ILLUMINA" sample_object_2.source_database = "SRA" sample_object_2.save() association = ExperimentSampleAssociation() association.experiment = experiment_object association.sample = sample_object_1 association.save() association = ExperimentSampleAssociation() association.experiment = experiment_object association.sample = sample_object_2 association.save() sample_1_original_files = [] sample_2_original_files = [] original_file = OriginalFile() original_file.source_url = "first_url" original_file.source_filename = "first_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_1_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_1 original_file_sample_association.save() original_file = OriginalFile() original_file.source_url = "second_url" original_file.source_filename = "second_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_2_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_1 original_file_sample_association.save() original_file = OriginalFile() original_file.source_url = "third_url" original_file.source_filename = "third_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_2_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_2 original_file_sample_association.save() original_file = OriginalFile() original_file.source_url = "fourth_url" original_file.source_filename = "fourth_filename" original_file.is_downloaded = False original_file.has_raw = True original_file.save() sample_2_original_files.append(original_file) original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object_2 original_file_sample_association.save() survey_job = SurveyJob(source_type="SRA") survey_job.save() surveyor = SraSurveyor(survey_job) surveyor.queue_downloader_job_for_original_files( sample_1_original_files, experiment_object.accession_code ) surveyor.queue_downloader_job_for_original_files( sample_2_original_files, experiment_object.accession_code ) self.assertEqual(DownloaderJob.objects.all().count(), 2)
def test_no_repeat_jobs(self): """Make sure that queue_downloader_jobs queues all expected Downloader jobs for a given experiment. """ # First, create an experiment with two samples associated with it # and create two original files for each of those samples. experiment_object = Experiment() experiment_object.accession_code = "Experiment1" experiment_object.save() sample_object = Sample() sample_object.accession_code = "Sample1" sample_object.platform_accession_code = "Illumina Genome Analyzer" sample_object.platform_accession_name = "Illumina Genome Analyzer" sample_object.technology = "RNA-SEQ" sample_object.manufacturer = "ILLUMINA" sample_object.source_database = "SRA" sample_object.save() original_file_1 = OriginalFile() original_file_1.source_url = "first_url" original_file_1.source_filename = "first_filename" original_file_1.is_downloaded = False original_file_1.has_raw = True original_file_1.save() original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file_1 original_file_sample_association.sample = sample_object original_file_sample_association.save() original_file_2 = OriginalFile() original_file_2.source_url = "second_url" original_file_2.source_filename = "second_filename" original_file_2.is_downloaded = False original_file_2.has_raw = True original_file_2.save() original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.original_file = original_file_2 original_file_sample_association.sample = sample_object original_file_sample_association.save() dlj = DownloaderJob() dlj.save() DownloaderJobOriginalFileAssociation( downloader_job=dlj, original_file=original_file_1 ).save() DownloaderJobOriginalFileAssociation( downloader_job=dlj, original_file=original_file_2 ).save() survey_job = SurveyJob(source_type="SRA") survey_job.save() surveyor = SraSurveyor(survey_job) surveyor.queue_downloader_job_for_original_files( [original_file_1, original_file_2], experiment_object.accession_code ) # We made one DownloaderJob in this test, so # queue_downloader_job_for_original_files didn't have anything # to do, so there should still be only one: self.assertEqual(1, DownloaderJob.objects.all().count())
def test_organism_shepherd_command(self, mock_nomad, mock_send_job, mock_get_active_volumes): """Tests that the organism shepherd requeues jobs in the right order. The situation we're setting up is basically this: * There are two experiments. * One of them has 1/2 samples processed, the other 0/1 * One of them needs a DownloaderJob requeued and the other needs a ProcessorJob requued. And what we're going to test for is: * Both of the jobs that need to be requeued are requeued. * The experiment with a processed sample is requeued first because it has a higher completion percentage. """ # First, set up our mocks to prevent network calls. mock_send_job.return_value = True active_volumes = {"1", "2", "3"} mock_get_active_volumes.return_value = active_volumes def mock_init_nomad(host, port=0, timeout=0): ret_value = MagicMock() ret_value.jobs = MagicMock() ret_value.jobs.get_jobs = MagicMock() ret_value.jobs.get_jobs.side_effect = lambda: [] return ret_value mock_nomad.side_effect = mock_init_nomad zebrafish = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True) zebrafish.save() # Experiment that is 0% complete. zero_percent_experiment = Experiment(accession_code='ERP037000') zero_percent_experiment.technology = 'RNA-SEQ' zero_percent_experiment.save() organism_assoc = ExperimentOrganismAssociation.objects.create( organism=zebrafish, experiment=zero_percent_experiment) zero_percent = OriginalFile() zero_percent.filename = "ERR037001.fastq.gz" zero_percent.source_filename = "ERR037001.fastq.gz" zero_percent.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz" zero_percent.is_archive = True zero_percent.save() zero_percent_sample = Sample() zero_percent_sample.accession_code = 'ERR037001' zero_percent_sample.organism = zebrafish zero_percent_sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = zero_percent_sample assoc.original_file = zero_percent assoc.save() assoc = ExperimentSampleAssociation() assoc.sample = zero_percent_sample assoc.experiment = zero_percent_experiment assoc.save() # TODO: fix names of all the variables to be appropriate for this test case. zero_percent_dl_job = DownloaderJob() zero_percent_dl_job.accession_code = zero_percent_sample.accession_code zero_percent_dl_job.downloader_task = "SRA" zero_percent_dl_job.start_time = timezone.now() zero_percent_dl_job.end_time = timezone.now() zero_percent_dl_job.success = False zero_percent_dl_job.save() assoc = DownloaderJobOriginalFileAssociation() assoc.downloader_job = zero_percent_dl_job assoc.original_file = zero_percent assoc.save() # Experiment that is 50% complete. fify_percent_experiment = Experiment(accession_code='ERP036000') fify_percent_experiment.technology = 'RNA-SEQ' fify_percent_experiment.save() organism_assoc = ExperimentOrganismAssociation.objects.create( organism=zebrafish, experiment=fify_percent_experiment) ## First sample, this one has been processed. successful_pj = ProcessorJob() successful_pj.accession_code = "ERR036000" successful_pj.pipeline_applied = "SALMON" successful_pj.ram_amount = 12288 successful_pj.start_time = timezone.now() successful_pj.end_time = timezone.now() successful_pj.success = True successful_pj.save() successful_og = OriginalFile() successful_og.filename = "ERR036000.fastq.gz" successful_og.source_filename = "ERR036000.fastq.gz" successful_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz" successful_og.is_archive = True successful_og.save() successful_sample = Sample() successful_sample.accession_code = 'ERR036000' successful_sample.organism = zebrafish successful_sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = successful_sample assoc.original_file = successful_og assoc.save() assoc = ProcessorJobOriginalFileAssociation() assoc.processor_job = successful_pj assoc.original_file = successful_og assoc.save() assoc = ExperimentSampleAssociation() assoc.sample = successful_sample assoc.experiment = fify_percent_experiment assoc.save() ## Second sample, this one hasn't been processed. fifty_percent_unprocessed_og = OriginalFile() fifty_percent_unprocessed_og.filename = "ERR036001.fastq.gz" fifty_percent_unprocessed_og.source_filename = "ERR036001.fastq.gz" fifty_percent_unprocessed_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz" fifty_percent_unprocessed_og.is_archive = True fifty_percent_unprocessed_og.save() fifty_percent_unprocessed_sample = Sample() fifty_percent_unprocessed_sample.accession_code = 'ERR036001' fifty_percent_unprocessed_sample.organism = zebrafish fifty_percent_unprocessed_sample.save() assoc = OriginalFileSampleAssociation() assoc.sample = fifty_percent_unprocessed_sample assoc.original_file = fifty_percent_unprocessed_og assoc.save() assoc = ExperimentSampleAssociation() assoc.sample = fifty_percent_unprocessed_sample assoc.experiment = fify_percent_experiment assoc.save() fifty_percent_processor_job = ProcessorJob() fifty_percent_processor_job.pipeline_applied = "SALMON" fifty_percent_processor_job.accession_code = fifty_percent_unprocessed_sample.accession_code fifty_percent_processor_job.ram_amount = 12288 fifty_percent_processor_job.start_time = timezone.now() fifty_percent_processor_job.end_time = timezone.now() fifty_percent_processor_job.success = False fifty_percent_processor_job.save() assoc = ProcessorJobOriginalFileAssociation() assoc.processor_job = fifty_percent_processor_job assoc.original_file = fifty_percent_unprocessed_og assoc.save() # Setup is done, actually run the command. args = [] options = {"organism_name": "DANIO_RERIO"} call_command("organism_shepherd", *args, **options) # Verify that the jobs were called in the correct order. mock_calls = mock_send_job.mock_calls first_call_job_type = mock_calls[0][1][0] first_call_job_object = mock_calls[0][2]["job"] self.assertEqual(first_call_job_type, ProcessorPipeline.SALMON) self.assertEqual(first_call_job_object.pipeline_applied, fifty_percent_processor_job.pipeline_applied) self.assertEqual(first_call_job_object.ram_amount, fifty_percent_processor_job.ram_amount) self.assertIn(first_call_job_object.volume_index, active_volumes) fifty_percent_processor_job.refresh_from_db() self.assertEqual(first_call_job_object, fifty_percent_processor_job.retried_job) second_call_job_type = mock_calls[1][1][0] second_call_job_object = mock_calls[1][2]["job"] self.assertEqual(second_call_job_type, Downloaders.SRA) self.assertEqual(second_call_job_object.accession_code, zero_percent_dl_job.accession_code) self.assertEqual(second_call_job_object.downloader_task, zero_percent_dl_job.downloader_task) zero_percent_dl_job.refresh_from_db() self.assertEqual(second_call_job_object, zero_percent_dl_job.retried_job)
def create_samples_from_api(self, experiment: Experiment, platform_dict: Dict) -> List[Sample]: """Generates a Sample item for each sample in an AE experiment. There are many possible data situations for a sample: - If the sample only has raw data available: - If it is on a platform that we support: Download this raw data and process it - If it is not on a platform we support: Don't download anything, don't process anything - If the sample has both raw and derived data: - If the raw data is on a platform we support: Download the raw data and process it, abandon the derived data - If the raw data is not on a platform we support Download the derived data and no-op it, abandon the raw data - If the sample only has derived data: Download the derived data and no-op it. See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples """ created_samples = [] samples_endpoint = SAMPLES_URL.format(experiment.accession_code) r = utils.requests_retry_session().get(samples_endpoint, timeout=60) samples = r.json()["experiment"]["sample"] # The SDRF is the complete metadata record on a sample/property basis. # We run this through our harmonizer and then attach the properties # to our created samples. SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt" sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code) sdrf_samples = harmony.parse_sdrf(sdrf_url) harmonized_samples = harmony.harmonize(sdrf_samples) # An experiment can have many samples for sample_data in samples: # For some reason, this sample has no files associated with it. if "file" not in sample_data or len(sample_data['file']) == 0: continue # Each sample is given an experimenatlly-unique title. flat_sample = utils.flatten(sample_data) title = harmony.extract_title(flat_sample) # A sample may actually have many sub files. # If there is raw data, take that. # If not, take the derived. has_raw = False for sub_file in sample_data['file']: # For ex: E-GEOD-15645 if isinstance(sub_file['comment'], list): sub_file_mod = sub_file sub_file_mod['comment'] = sub_file['comment'][0] else: sub_file_mod = sub_file # Some have the 'data' field, but not the actual data # Ex: E-GEOD-9656 if sub_file_mod['type'] == "data" and sub_file_mod[ 'comment'].get('value', None) != None: has_raw = True if 'raw' in sub_file_mod['comment'].get('value', ''): has_raw = True skip_sample = False for sub_file in sample_data['file']: # Don't get the raw data if it's only a 1-color sample. if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data): has_raw = False # Skip derived data if we have it raw. if has_raw and "derived data" in sub_file['type']: continue download_url = None filename = sub_file["name"] # sub_file["comment"] is only a list if there's # more than one comment... comments = sub_file["comment"] if isinstance(comments, list): # Could be: "Derived ArrayExpress Data Matrix FTP # file" or: "ArrayExpress FTP file". If there is # no comment with a name including "FTP file" then # we don't know where to download it so we need to # mark this job as an error. Therefore don't catch # the potential exception where download_url # doesn't get defined. for comment in comments: if "FTP file" in comment["name"]: download_url = comment["value"] break else: download_url = comments["value"] if not download_url: logger.error( "Sample %s did not specify a download url, skipping.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id, sub_file=sub_file) skip_sample = True continue if not filename: logger.error( "Sample %s did not specify a filename, skipping.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id, sub_file=sub_file) skip_sample = True continue if skip_sample: continue # The accession code is not a simple matter to determine. sample_source_name = sample_data["source"].get("name", "") sample_assay_name = sample_data["assay"].get("name", "") sample_accession_code = self.determine_sample_accession( experiment.accession_code, sample_source_name, sample_assay_name, filename) # Figure out the Organism for this sample organism_name = UNKNOWN for characteristic in sample_data["characteristic"]: if characteristic["category"].upper() == "ORGANISM": organism_name = characteristic["value"].upper() if organism_name == UNKNOWN: logger.error( "Sample %s did not specify the organism name.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id) organism = None continue else: organism = Organism.get_object_for_name(organism_name) # Create the sample object try: # Associate it with the experiment, but since it # already exists it already has original files # associated with it and it's already been downloaded, # so don't add it to created_samples. sample_object = Sample.objects.get( accession_code=sample_accession_code) # If input experiment includes new protocol information, # update sample's protocol_info. existing_protocols = sample_object.protocol_info protocol_info, is_updated = self.update_sample_protocol_info( existing_protocols, experiment.protocol_description, experiment.source_url + '/protocols') if is_updated: sample_object.protocol_info = protocol_info sample_obejct.save() logger.debug( "Sample %s already exists, skipping object creation.", sample_accession_code, experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id) except Sample.DoesNotExist: sample_object = Sample() # The basics sample_object.source_database = "ARRAY_EXPRESS" sample_object.title = title sample_object.accession_code = sample_accession_code sample_object.source_archive_url = samples_endpoint sample_object.organism = organism sample_object.platform_name = platform_dict[ "platform_accession_name"] sample_object.platform_accession_code = platform_dict[ "platform_accession_code"] sample_object.manufacturer = platform_dict["manufacturer"] sample_object.technology = "MICROARRAY" protocol_info, is_updated = self.update_sample_protocol_info( existing_protocols=[], experiment_protocol=experiment.protocol_description, protocol_url=experiment.source_url + '/protocols') # Do not check is_updated the first time because we must # save a list so we can append to it later. sample_object.protocol_info = protocol_info sample_object.save() # Directly assign the harmonized properties harmonized_sample = harmonized_samples[title] for key, value in harmonized_sample.items(): setattr(sample_object, key, value) sample_object.save() sample_annotation = SampleAnnotation() sample_annotation.data = sample_data sample_annotation.sample = sample_object sample_annotation.is_ccdl = False sample_annotation.save() original_file = OriginalFile() original_file.filename = filename original_file.source_filename = filename original_file.source_url = download_url original_file.is_downloaded = False original_file.is_archive = True original_file.has_raw = has_raw original_file.save() original_file_sample_association = OriginalFileSampleAssociation( ) original_file_sample_association.original_file = original_file original_file_sample_association.sample = sample_object original_file_sample_association.save() created_samples.append(sample_object) logger.debug( "Created " + str(sample_object), experiment_accession_code=experiment.accession_code, survey_job=self.survey_job.id, sample=sample_object.id) # Create associations if they don't already exist ExperimentSampleAssociation.objects.get_or_create( experiment=experiment, sample=sample_object) ExperimentOrganismAssociation.objects.get_or_create( experiment=experiment, organism=organism) return created_samples
def prepare_job(length): pj = ProcessorJob() pj.pipeline_applied = "TRANSCRIPTOME_INDEX_" + length.upper() pj.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS") samp = Sample() samp.organism = homo_sapiens samp.accession_code = "derp" + length samp.save() og_file = OriginalFile() og_file.source_filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.filename = "aegilops_tauschii_" + length + ".fa.gz" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz" og_file.is_downloaded = True og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.filename = "aegilops_tauschii_" + length + ".gtf.gz" og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz" og_file2.is_downloaded = True og_file2.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() og_file_samp_assoc2 = OriginalFileSampleAssociation() og_file_samp_assoc2.original_file = og_file2 og_file_samp_assoc2.sample = samp og_file_samp_assoc2.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() assoc2 = ProcessorJobOriginalFileAssociation() assoc2.original_file = og_file2 assoc2.processor_job = pj assoc2.save() return pj
def test_download_aspera_and_ftp(self): """ Tests the main 'download_geo' function. """ dlj = DownloaderJob() dlj.accession_code = 'GSE22427' dlj.save() original_file = OriginalFile() original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz" original_file.source_filename = "GSE22427_non-normalized.txt.gz" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj assoc.save() sample = Sample() sample.accession_code = 'GSE22427' sample.save() sample_annotation = SampleAnnotation() sample_annotation.sample = sample sample_annotation.data = { 'label_protocol_ch1': 'Agilent', 'label_protocol_ch2': 'Agilent' } sample_annotation.save() og_assoc = OriginalFileSampleAssociation() og_assoc.sample = sample og_assoc.original_file = original_file og_assoc.save() LOCAL_ROOT_DIR = "/home/user/data_store" os.makedirs(LOCAL_ROOT_DIR + '/' + sample.accession_code, exist_ok=True) dl_file_path = LOCAL_ROOT_DIR + '/' + sample.accession_code + '/' + original_file.source_url.split( '/')[-1] # Aspera result = geo._download_file(original_file.source_url, file_path=dl_file_path, job=dlj, force_ftp=False) self.assertTrue(result) self.assertTrue(os.path.exists(dl_file_path)) os.remove(dl_file_path) # FTP result = geo._download_file(original_file.source_url, file_path=dl_file_path, job=dlj, force_ftp=True) self.assertTrue(result) self.assertTrue(os.path.exists(dl_file_path)) os.remove(dl_file_path) # Aspera, fail result = geo._download_file_aspera("https://rich.zone/cool_horse.jpg", target_file_path=dl_file_path, downloader_job=dlj, attempt=5) self.assertFalse(result) self.assertTrue(dlj.failure_reason != None)
def setup_experiment(new_version_accessions: List[str], old_version_accessions: List[str]) -> Dict: """ Create an experiment where some samples were processed with the newest version of salmon and other with an older one. """ # Create the experiment experiment_accession = "SRP095529" data_dir = "/home/user/data_store/" experiment_dir = data_dir + experiment_accession experiment = Experiment.objects.create(accession_code=experiment_accession, technology="RNA-SEQ") zebrafish = Organism.get_object_for_name("DANIO_RERIO") # Create the transcriptome processor and result: transcriptome_processor = Processor() transcriptome_processor.name = "Transcriptome" transcriptome_processor.version = "salmon 0.9.1" transcriptome_processor.docker_image = "dr_transcriptome" transcriptome_processor.environment = '{"some": "environment"}' transcriptome_processor.save() computational_result_short = ComputationalResult( processor=transcriptome_processor) computational_result_short.save() organism_index = OrganismIndex() organism_index.index_type = "TRANSCRIPTOME_SHORT" organism_index.organism = zebrafish organism_index.result = computational_result_short organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT" organism_index.salmon_version = "salmon 0.9.1" organism_index.save() comp_file = ComputedFile() # This path will not be used because we already have the files extracted. comp_file.absolute_file_path = ( "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz") comp_file.result = computational_result_short comp_file.size_in_bytes = 1337 comp_file.sha1 = "ABC" comp_file.s3_key = "key" comp_file.s3_bucket = "bucket" comp_file.save() quant_processor = Processor() quant_processor.name = "Salmon Quant" quant_processor.version = "salmon 0.9.1" quant_processor.docker_image = "dr_salmon" quant_processor.environment = '{"some": "environment"}' quant_processor.save() for accession_code in old_version_accessions: sample = Sample.objects.create( accession_code=accession_code, organism=zebrafish, source_database="SRA", technology="RNA-SEQ", platform_accession_code="IlluminaHiSeq1000", ) ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample) original_file = OriginalFile() original_file.filename = accession_code + ".SRA" original_file.source_filename = accession_code + ".SRA" original_file.save() OriginalFileSampleAssociation.objects.get_or_create( original_file=original_file, sample=sample) # Create and associate quant result and files. quant_result = ComputationalResult() quant_result.is_ccdl = True quant_result.processor = quant_processor quant_result.organism_index = organism_index # associate with OLD organism index quant_result.save() kv = ComputationalResultAnnotation() kv.data = {"index_length": "short"} kv.result = quant_result kv.is_public = True kv.save() # In prod the filename pattern will involve the timestamp # but here we're using the accession code so we can find # the archive file for the current sample. archive_filename = "result-" + accession_code + ".tar.gz" archive_file = ComputedFile() archive_file.filename = archive_filename archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename) archive_file.is_public = False archive_file.is_smashable = False archive_file.is_qc = False archive_file.result = quant_result archive_file.size_in_bytes = 12345 archive_file.save() quant_file = ComputedFile() quant_file.filename = "quant.sf" quant_file.absolute_file_path = (experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf") quant_file.is_public = False quant_file.is_smashable = False quant_file.is_qc = False quant_file.result = quant_result quant_file.size_in_bytes = 12345 quant_file.s3_bucket = "bucket" quant_file.s3_key = "key" quant_file.save() SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result) # Create another OrganismIndex with a newer version of transcriptome_processor = Processor() transcriptome_processor.name = "Transcriptome" transcriptome_processor.version = "salmon 0.13.1" transcriptome_processor.docker_image = "dr_transcriptome" transcriptome_processor.environment = '{"some": "environment"}' transcriptome_processor.save() computational_result_short = ComputationalResult( processor=transcriptome_processor) computational_result_short.save() organism_index = OrganismIndex() organism_index.index_type = "TRANSCRIPTOME_SHORT" organism_index.organism = zebrafish organism_index.result = computational_result_short organism_index.absolute_directory_path = "/home/user/data_store/ZEBRAFISH_INDEX/SHORT" organism_index.salmon_version = "salmon 0.13.1" # DIFFERENT SALMON VERSION organism_index.save() comp_file = ComputedFile() # This path will not be used because we already have the files extracted. comp_file.absolute_file_path = ( "/home/user/data_store/ZEBRAFISH_INDEX/SHORT/zebrafish_short.tar.gz") comp_file.result = computational_result_short comp_file.size_in_bytes = 1337 comp_file.sha1 = "ABC" comp_file.s3_key = "key" comp_file.s3_bucket = "bucket" comp_file.save() for accession_code in new_version_accessions: sample = Sample.objects.create( accession_code=accession_code, organism=zebrafish, source_database="SRA", technology="RNA-SEQ", platform_accession_code="IlluminaHiSeq1000", ) ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample) original_file = OriginalFile() original_file.filename = accession_code + ".SRA" original_file.source_filename = accession_code + ".SRA" original_file.save() OriginalFileSampleAssociation.objects.get_or_create( original_file=original_file, sample=sample) # Create and associate quant result and files. quant_result = ComputationalResult() quant_result.is_ccdl = True quant_result.processor = quant_processor quant_result.organism_index = organism_index # NEWER VERSION quant_result.save() kv = ComputationalResultAnnotation() kv.data = {"index_length": "short"} kv.result = quant_result kv.is_public = True kv.save() # In prod the filename pattern will involve the timestamp # but here we're using the accession code so we can find # the archive file for the current sample. archive_filename = "result-" + accession_code + ".tar.gz" archive_file = ComputedFile() archive_file.filename = archive_filename archive_file.absolute_file_path = os.path.join(experiment_dir, archive_filename) archive_file.is_public = False archive_file.is_smashable = False archive_file.is_qc = False archive_file.result = quant_result archive_file.size_in_bytes = 12345 archive_file.save() quant_file = ComputedFile() quant_file.filename = "quant.sf" quant_file.absolute_file_path = (experiment_dir + "/quant_files/" + accession_code + "_output/quant.sf") quant_file.is_public = False quant_file.is_smashable = False quant_file.is_qc = False quant_file.result = quant_result quant_file.size_in_bytes = 12345 quant_file.s3_bucket = "bucket" quant_file.s3_key = "key" quant_file.save() SampleResultAssociation.objects.get_or_create(sample=sample, result=quant_result) return experiment
def test_download_geo(self, mock_send_task): """ Tests the main 'download_geo' function. """ dlj = DownloaderJob() dlj.accession_code = 'GSE22427' dlj.save() original_file = OriginalFile() original_file.filename = "GSE22427_non-normalized.txt.gz" original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz" original_file.source_filename = "GSE22427_non-normalized.txt.gz" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj assoc.save() sample = Sample() sample.accession_code = 'GSE22427' sample.technology = "MICROARRAY" sample.manufacturer = "AGILENT" sample.has_raw = True # This is fake, but we don't currently support any agilent # platforms so we're using a platform that is supported. sample.platform_accession_code = "Illumina_RatRef-12_V1.0" sample.save() sample_annotation = SampleAnnotation() sample_annotation.sample = sample sample_annotation.data = { 'label_protocol_ch1': 'Agilent', 'label_protocol_ch2': 'Agilent' } sample_annotation.save() og_assoc = OriginalFileSampleAssociation() og_assoc.sample = sample og_assoc.original_file = original_file og_assoc.save() download_result = geo.download_geo(dlj.id) file_assocs = OriginalFileSampleAssociation.objects.filter( sample=sample) self.assertEqual(file_assocs.count(), 2) for file_assoc in file_assocs: original_file = file_assoc.original_file if original_file.filename.endswith(".gz"): # We delete the archive after we extract from it self.assertFalse(original_file.is_downloaded) else: self.assertTrue(original_file.is_downloaded) # Make sure it worked self.assertTrue(download_result) self.assertTrue(dlj.failure_reason is None) self.assertTrue(len(ProcessorJob.objects.all()) > 0) self.assertEqual(ProcessorJob.objects.all()[0].pipeline_applied, "AGILENT_TWOCOLOR_TO_PCL") self.assertEqual(ProcessorJob.objects.all()[0].ram_amount, 2048)
def test_dharma(self): dlj1 = DownloaderJob() dlj1.accession_code = 'D1' dlj1.worker_id = get_instance_id() dlj1.start_time = datetime.datetime.now() dlj1.save() dlj2 = DownloaderJob() dlj2.accession_code = 'D2' dlj2.worker_id = get_instance_id() dlj2.start_time = datetime.datetime.now() dlj2.save() dlj3 = DownloaderJob() dlj3.accession_code = 'D3' dlj3.worker_id = get_instance_id() dlj3.save() original_file = OriginalFile() original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip" original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj3 assoc.save() sample = Sample() sample.accession_code = 'Blahblahblah' sample.technology = "MICROARRAY" sample.manufacturer = "AFFYMETRIX" sample.has_raw = True sample.platform_accession_code = "hgu133a" sample.save() OriginalFileSampleAssociation.objects.get_or_create( sample=sample, original_file=original_file) exited = False try: utils.start_job(dlj3.id, max_downloader_jobs_per_node=2, force_harakiri=True) except SystemExit as e: # This is supposed to happen! self.assertTrue(True) exited = True except Exception as e: # This isn't! self.assertTrue(False) self.assertTrue(exited) exited = False try: utils.start_job(dlj3.id, max_downloader_jobs_per_node=15, force_harakiri=True) except SystemExit as e: # This is not supposed to happen! self.assertTrue(False) exited = True except Exception as e: # This is! self.assertTrue(True) self.assertFalse(exited)
def test_download_multiple_zips(self, mock_send_job): """Tests that each sample gets one processor job no matter what. https://github.com/AlexsLemonade/refinebio/pull/351 deals with a bug where every file that was extracted to a directory got a processor job queued for it each time a downloader job ran which pointed to that directory. This test makes sure this bug stays squashed. It does so by running two downloader jobs for the same experiment which use two different zip files. Before this bug was squashed this would have resulted in the first sample getting a second processor job queued for it because the second downloader job would have found the file in the directory. """ dlj1 = DownloaderJob() dlj1.accession_code = 'E-MEXP-433' dlj1.save() original_file = OriginalFile() original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip" original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj1 assoc.save() sample = Sample() sample.accession_code = 'E-MEXP-433-Waldhof_020604_R30_01-2753_U133A' sample.technology = "MICROARRAY" sample.manufacturer = "AFFYMETRIX" sample.has_raw = True # This is fake, but we don't currently support any agilent # platforms so we're using a platform that is supported. sample.platform_accession_code = "hgu133a" sample.save() OriginalFileSampleAssociation.objects.get_or_create( sample=sample, original_file=original_file) dlj2 = DownloaderJob() dlj2.accession_code = 'E-MEXP-433' dlj2.save() original_file = OriginalFile() original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.2.zip" original_file.source_filename = "N08_U133A.CEL" original_file.save() assoc = DownloaderJobOriginalFileAssociation() assoc.original_file = original_file assoc.downloader_job = dlj2 assoc.save() sample = Sample() sample.accession_code = 'E-MEXP-433-N08_U133A' sample.technology = "MICROARRAY" sample.manufacturer = "AFFYMETRIX" sample.has_raw = True # This is fake, but we don't currently support any agilent # platforms so we're using a platform that is supported. sample.platform_accession_code = "hgu133a" sample.save() OriginalFileSampleAssociation.objects.get_or_create( sample=sample, original_file=original_file) array_express.download_array_express(dlj1.id) array_express.download_array_express(dlj2.id) self.assertEqual(ProcessorJob.objects.all().count(), 2)
def setUp(self): # Saving this for if we have protected endpoints # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword') # self.client.login(username='******', password='******') # self.user = User.objects.create(username="******") experiment = Experiment() experiment.accession_code = "GSE000" experiment.alternate_accession_code = "E-GEOD-000" experiment.title = "NONONONO" experiment.description = "Boooooourns. Wasabi." experiment.technology = "RNA-SEQ" experiment.save() experiment = Experiment() experiment.accession_code = "GSE123" experiment.title = "Hey Ho Let's Go" experiment.description = ( "This is a very exciting test experiment. Faygo soda. Blah blah blah." ) experiment.technology = "MICROARRAY" experiment.save() self.experiment = experiment experiment_annotation = ExperimentAnnotation() experiment_annotation.data = {"hello": "world", "123": 456} experiment_annotation.experiment = experiment experiment_annotation.save() # Create 26 test organisms numbered 0-25 for pagination test, so there should be 29 organisms total (with the 3 others below) for i in range(26): Organism(name=("TEST_ORGANISM_{}".format(i)), taxonomy_id=(1234 + i)).save() ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True) ailuropoda.save() self.homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True) self.homo_sapiens.save() self.danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True) self.danio_rerio.save() sample = Sample() sample.title = "123" sample.accession_code = "123" sample.is_processed = True sample.organism = ailuropoda sample.save() sample = Sample() sample.title = "789" sample.accession_code = "789" sample.is_processed = True sample.organism = ailuropoda sample.save() self.sample = sample # add qn target for sample organism result = ComputationalResult() result.commands.append("create_qn_target.py") result.is_ccdl = True result.is_public = True result.processor = None result.save() cra = ComputationalResultAnnotation() cra.result = result cra.data = {"organism_id": ailuropoda.id, "is_qn": True} cra.save() ailuropoda.qn_target = result ailuropoda.save() sample_annotation = SampleAnnotation() sample_annotation.data = {"goodbye": "world", "789": 123} sample_annotation.sample = sample sample_annotation.save() original_file = OriginalFile() original_file.save() original_file_sample_association = OriginalFileSampleAssociation() original_file_sample_association.sample = sample original_file_sample_association.original_file = original_file original_file_sample_association.save() downloader_job = DownloaderJob() downloader_job.save() download_assoc = DownloaderJobOriginalFileAssociation() download_assoc.original_file = original_file download_assoc.downloader_job = downloader_job download_assoc.save() processor_job = ProcessorJob() processor_job.save() processor_assoc = ProcessorJobOriginalFileAssociation() processor_assoc.original_file = original_file processor_assoc.processor_job = processor_job processor_assoc.save() experiment_sample_association = ExperimentSampleAssociation() experiment_sample_association.sample = sample experiment_sample_association.experiment = experiment experiment_sample_association.save() experiment.num_total_samples = 1 experiment.num_processed_samples = 1 experiment.save() result = ComputationalResult() result.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() result = ComputationalResult() result.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() processor = Processor() processor.name = "Salmon Quant" processor.version = "v9.9.9" processor.docker_image = "dr_salmon" processor.environment = '{"some": "environment"}' processor.save() computational_result_short = ComputationalResult(processor=processor) computational_result_short.save() organism_index = OrganismIndex() organism_index.index_type = "TRANSCRIPTOME_SHORT" organism_index.organism = self.danio_rerio organism_index.result = computational_result_short organism_index.absolute_directory_path = ( "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT") organism_index.is_public = True organism_index.s3_url = "not_blank" organism_index.save() return
def prepare_job(): pj = ProcessorJob() pj.pipeline_applied = "SALMON" pj.save() c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS") samp = Sample() samp.accession_code = "SALMON" # So the test files go to the right place samp.organism = c_elegans samp.save() prepare_organism_indices() og_file = OriginalFile() og_file.source_filename = "ERR1562482_1.fastq.gz" og_file.filename = "ERR1562482_1.fastq.gz" og_file.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_1.fastq.gz" og_file.save() og_file2 = OriginalFile() og_file2.source_filename = "ERR1562482_2.fastq.gz" og_file2.filename = "ERR1562482_2.fastq.gz" og_file2.absolute_file_path = "/home/user/data_store/raw/TEST/SALMON/ERR1562482_2.fastq.gz" og_file2.save() og_file_samp_assoc = OriginalFileSampleAssociation() og_file_samp_assoc.original_file = og_file og_file_samp_assoc.sample = samp og_file_samp_assoc.save() og_file_samp_assoc2 = OriginalFileSampleAssociation() og_file_samp_assoc2.original_file = og_file2 og_file_samp_assoc2.sample = samp og_file_samp_assoc2.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file assoc1.processor_job = pj assoc1.save() assoc1 = ProcessorJobOriginalFileAssociation() assoc1.original_file = og_file2 assoc1.processor_job = pj assoc1.save() return pj, [og_file, og_file2]