def _generate_files(self, species: Dict) -> "List[OriginalFile]":
    """Create and save the OriginalFile records for one species.

    One record is saved for the species' FASTA file and one for its GTF
    file. Both are flagged as archives that have not been downloaded yet.

    `species` is mutated: its "division" key is removed and the dict is
    then handed to `self._clean_metadata`.

    Returns a list holding the FASTA record followed by the GTF record.
    """
    url_builder = ensembl_url_builder_factory(species)
    fasta_download_url = url_builder.build_transcriptome_url()
    gtf_download_url = url_builder.build_gtf_url()

    # The popped value itself is not used here; the pop only strips the
    # key from the metadata before it is cleaned.
    species.pop("division")
    self._clean_metadata(species)

    all_new_files = []
    downloads = (
        (".fa.gz", fasta_download_url),
        (".gtf.gz", gtf_download_url),
    )
    for extension, download_url in downloads:
        original_file = OriginalFile()
        original_file.source_filename = url_builder.filename_species + extension
        original_file.source_url = download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

    return all_new_files
def prepare_original_files(length: str) -> List[OriginalFile]:
    """Save and return the FASTA and GTF OriginalFile fixtures.

    `length` is spliced into the `source_filename` and `filename` of both
    records. The absolute file paths and source URLs are fixed values.

    Returns `[fasta_record, gtf_record]`, both already saved.
    """
    fixture_specs = (
        (
            ".fa.gz",
            "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/"
            "AEGILOPS_TAUSCHII/aegilops_tauschii_short.fa.gz",
            "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/"
            "aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz",
        ),
        (
            ".gtf.gz",
            "/home/user/data_store/raw/TEST/TRANSCRIPTOME_INDEX/"
            "AEGILOPS_TAUSCHII/aegilops_tauschii_short.gtf.gz",
            "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/"
            "aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz",
        ),
    )

    prepared = []
    for extension, local_path, remote_url in fixture_specs:
        name = "aegilops_tauschii_" + length + extension

        record = OriginalFile()
        record.source_filename = name
        record.filename = name
        record.absolute_file_path = local_path
        # We need to add the URL here so that
        # _extract_assembly_information works properly
        record.source_url = remote_url
        record.is_downloaded = True
        record.save()
        prepared.append(record)

    return prepared
def _generate_files(self, species: Dict) -> "List[OriginalFile]":
    """Create and save the OriginalFile records for one species.

    One record is saved for the species' FASTA file and one for its GTF
    file. Both are flagged as archives that have not been downloaded yet.

    Returns a list holding the FASTA record followed by the GTF record.
    """
    url_builder = ensembl_url_builder_factory(species)
    fasta_download_url = url_builder.build_transcriptome_url()
    gtf_download_url = url_builder.build_gtf_url()

    # Getting the object will ensure it is created in the DB.
    Organism.get_or_create_object_for_id(url_builder.taxonomy_id)

    all_new_files = []
    downloads = (
        (".fa.gz", fasta_download_url),
        (".gtf.gz", gtf_download_url),
    )
    for extension, download_url in downloads:
        original_file = OriginalFile()
        original_file.source_filename = url_builder.filename_species + extension
        original_file.source_url = download_url
        original_file.is_archive = True
        original_file.is_downloaded = False
        original_file.save()
        all_new_files.append(original_file)

    return all_new_files
def test_download_file_swapper(self, mock_send_job):
    """Check that `sra._download_file` reports success for an .sra file."""
    mock_send_job.return_value = None

    job = DownloaderJob()
    job.accession_code = "DRR002116"
    job.save()

    sra_file = OriginalFile()
    sra_file.source_filename = "DRR002116.sra"
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artifact rather than a real user@host prefix -- confirm the
    # intended URL.
    sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
    sra_file.is_archive = True
    sra_file.save()

    run_sample = Sample()
    run_sample.accession_code = 'DRR002116'
    run_sample.save()

    # Tie the file to its sample, then to the downloader job.
    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = sra_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = sra_file
    job_link.save()

    outcome = sra._download_file(sra_file.source_url, job, "/tmp", force_ftp=False)
    self.assertTrue(outcome)
def _replace_dotsra_with_fastq_files(
    sample: Sample, downloader_job: DownloaderJob, original_file: OriginalFile
) -> List[OriginalFile]:
    """Swap an .sra file record for the sample's two ENA read files.

    `original_file` is re-pointed at the "_1" read file and saved. A
    record for the "_2" read file is fetched or created and associated
    with both `sample` and `downloader_job`.

    Nothing here checks that ENA actually has two read files for the
    sample; the caller is expected to pass only samples for which that
    holds.

    Returns `[original_file, second_read_file]`.
    """
    first_read_url, second_read_url = (
        _build_ena_file_url(sample.accession_code, suffix)
        for suffix in ("_1", "_2")
    )

    # Technically this is a different file, but deleting this one and
    # its associations just to recreate another with the same
    # associations seems rather pointless.
    original_file.source_url = first_read_url
    original_file.source_filename = first_read_url.rsplit("/", 1)[-1]
    original_file.save()

    second_read_file, _ = OriginalFile.objects.get_or_create(
        source_url=second_read_url,
        source_filename=second_read_url.rsplit("/", 1)[-1],
        has_raw=True,
    )

    OriginalFileSampleAssociation.objects.get_or_create(
        original_file=second_read_file, sample=sample
    )
    DownloaderJobOriginalFileAssociation.objects.get_or_create(
        original_file=second_read_file, downloader_job=downloader_job
    )

    return [original_file, second_read_file]
def test_download_file_ncbi(self, mock_send_job):
    """Run `sra.download_sra` on an .sra file and check the downloaded file."""
    mock_send_job.return_value = None

    job = DownloaderJob()
    job.accession_code = "DRR002116"
    job.save()

    sra_file = OriginalFile()
    sra_file.source_filename = "DRR002116.sra"
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artifact rather than a real user@host prefix -- confirm the
    # intended URL.
    sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
    sra_file.is_archive = True
    sra_file.save()

    run_sample = Sample()
    run_sample.accession_code = 'DRR002116'
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = sra_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = sra_file
    job_link.save()

    result, downloaded_files = sra.download_sra(job.pk)
    utils.end_downloader_job(job, result)

    self.assertTrue(result)
    first_download = downloaded_files[0]
    self.assertEqual(first_download.sha1,
                     'd5374e7fe047d4f76b165c3f5148ab2df9d42cea')
    self.assertTrue(os.path.exists(first_download.absolute_file_path))
def test_download_file(self, mock_send_job):
    """Run `sra.download_sra` for a fastq file; passes if nothing raises."""
    mock_send_job.return_value = None

    job = DownloaderJob()
    job.accession_code = "ERR036"
    job.save()

    fastq_file = OriginalFile()
    fastq_file.source_filename = "ERR036000.fastq.gz"
    fastq_file.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    fastq_file.is_archive = True
    fastq_file.save()

    run_sample = Sample()
    run_sample.accession_code = 'ERR036000'
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = fastq_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = fastq_file
    job_link.save()

    # The return value is not inspected by this test.
    sra.download_sra(job.pk)
def test_download_file_ncbi(self):
    """Run `sra.download_sra` on an .sra file and check the downloaded file."""
    job = DownloaderJob()
    job.accession_code = "SRR9117853"
    job.save()

    sra_file = OriginalFile()
    sra_file.source_filename = "SRR9117853.sra"
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artifact rather than a real user@host prefix -- confirm the
    # intended URL.
    sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
    sra_file.is_archive = True
    sra_file.save()

    run_sample = Sample()
    run_sample.accession_code = "SRR9117853"
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = sra_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = sra_file
    job_link.save()

    result, downloaded_files = sra.download_sra(job.pk)
    utils.end_downloader_job(job, result)

    self.assertTrue(result)
    first_download = downloaded_files[0]
    self.assertEqual(first_download.sha1,
                     "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc")
    self.assertTrue(os.path.exists(first_download.absolute_file_path))
def test_download_file(self):
    """Run `sra.download_sra` on a fastq file and check the downloaded file."""
    job = DownloaderJob()
    job.accession_code = "ERR036"
    job.save()

    fastq_file = OriginalFile()
    fastq_file.source_filename = "ERR036000.fastq.gz"
    fastq_file.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    fastq_file.is_archive = True
    fastq_file.save()

    run_sample = Sample()
    run_sample.accession_code = "ERR036000"
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = fastq_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = fastq_file
    job_link.save()

    result, downloaded_files = sra.download_sra(job.pk)

    self.assertTrue(result)
    first_download = downloaded_files[0]
    self.assertEqual(first_download.sha1,
                     "1dfe5460a4101fe87feeffec0cb2e053f6695961")
    self.assertTrue(os.path.exists(first_download.absolute_file_path))
def test_assembly_information(self):
    """Check the assembly version and name extracted from the source URLs.

    The OriginalFile objects are built in memory only; they are never
    saved.
    """
    fasta_file = OriginalFile()
    fasta_file.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/fasta/aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz"
    fasta_file.source_filename = "aegilops_tauschii_short.fa.gz"

    gtf_file = OriginalFile()
    gtf_file.source_url = "ftp://ftp.ensemblgenomes.org/pub/release-39/plants/gtf/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.39.gtf.gz"
    gtf_file.source_filename = "aegilops_tauschii_short.gtf.gz"

    context = transcriptome_index._extract_assembly_information({
        "original_files": [fasta_file, gtf_file],
        "computed_files": [],
    })

    self.assertEqual("39", context["assembly_version"])
    self.assertEqual("ASM34733v1", context["assembly_name"])
def _get_actual_file_if_queueable(
    extracted_subfile: Dict, original_file: OriginalFile, samples: List[Sample]
) -> OriginalFile:
    """Return the extracted file's record if it should be queued.

    `extracted_subfile` is a dict describing a file pulled out of an
    archive; its 'filename' and 'absolute_path' entries are read here.

    `original_file` is the file associated with the CURRENT
    DownloaderJob.

    `samples` are the samples a newly created record gets associated
    with.

    If a non-archive record with the same source filename and filename
    already exists, it is returned only when `needs_processing()` is
    true (and is marked as downloaded if it was not); otherwise None is
    returned. If no such record exists, a new one is created, saved,
    associated with every sample, and returned.
    """
    # Check to see if we've made this original file before:
    matches = OriginalFile.objects.filter(
        source_filename=original_file.source_filename,
        filename=extracted_subfile['filename'],
        is_archive=False,
    )

    if matches.count() > 0:
        # We've already created this record, let's see if we actually
        # needed to download it or if we just got it because we needed
        # a file in the same archive.
        existing = matches[0]
        if not existing.needs_processing():
            return None
        if not existing.is_downloaded:
            existing.is_downloaded = True
            existing.save()
        return existing

    # First time we see this file: record it. The path must be set
    # before the size and sha1 are calculated.
    fresh = OriginalFile()
    fresh.is_downloaded = True
    fresh.is_archive = False
    fresh.absolute_file_path = extracted_subfile['absolute_path']
    fresh.filename = extracted_subfile['filename']
    fresh.calculate_size()
    fresh.calculate_sha1()
    fresh.has_raw = True
    fresh.source_url = original_file.source_url
    fresh.source_filename = original_file.source_filename
    fresh.save()

    for sample in samples:
        link = OriginalFileSampleAssociation()
        link.sample = sample
        link.original_file = fresh
        link.save()

    return fresh
def test_download_file_unmated_reads(self):
    """Run `sra.download_sra` for a run with two read files.

    Both read files are registered with their expected md5 and size,
    associated with one sample and one downloader job, and the first
    downloaded file is then checked.
    """
    dlj = DownloaderJob()
    dlj.accession_code = "SRR1603661"
    dlj.save()

    og_1 = OriginalFile()
    og_1.source_filename = "SRR1603661_1.fastq.gz"
    og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz"
    og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c"
    og_1.expected_size_in_bytes = 6751980628
    og_1.is_archive = True
    og_1.save()

    og_2 = OriginalFile()
    og_2.source_filename = "SRR1603661_2.fastq.gz"
    og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz"
    # These describe the second read file, so they belong on og_2. They
    # used to be assigned to og_1 after it had already been saved, which
    # left og_2 without any expected md5 or size.
    og_2.expected_md5 = "fffd24457418d255991f54ec82a39d57"
    og_2.expected_size_in_bytes = 6949912932
    og_2.is_archive = True
    og_2.save()

    sample = Sample()
    sample.accession_code = "SRR1603661"
    sample.save()

    # Each read file is tied to the sample first, then to the job.
    for read_file in (og_1, og_2):
        assoc = OriginalFileSampleAssociation()
        assoc.sample = sample
        assoc.original_file = read_file
        assoc.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.downloader_job = dlj
        assoc.original_file = read_file
        assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)

    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1,
                     "52bf22472069d04fa7767429f6ab78ebd10c0152")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))
def test_download_file(self, mock_send_job):
    """Run `download_transcriptome` for a GTF file; passes if nothing raises."""
    mock_send_job.return_value = None

    job = DownloaderJob()
    job.save()

    gtf_file = OriginalFile()
    gtf_file.source_filename = "Aegilops_tauschii.ASM34733v1.37.gtf.gz"
    gtf_file.source_url = self.gtf_download_url
    gtf_file.is_archive = True
    gtf_file.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = gtf_file
    job_link.save()

    transcriptome_index.download_transcriptome(job.pk)
def test_no_rnaseq(self):
    """Makes sure that no RNA-Seq data gets downloaded even if there's
    a job for it.
    """
    job = DownloaderJob()
    job.accession_code = 'GSE103217'
    job.save()

    family_archive = OriginalFile()
    family_archive.filename = "GSE103217_family.xml.tgz"
    family_archive.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103217/miniml/GSE103217_family.xml.tgz"
    family_archive.source_filename = "GSE103217_family.xml.tgz"
    family_archive.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.original_file = family_archive
    job_link.downloader_job = job
    job_link.save()

    rnaseq_sample = Sample()
    rnaseq_sample.accession_code = 'GSE103217'
    rnaseq_sample.technology = "RNA-SEQ"
    rnaseq_sample.manufacturer = "ILLUMINA"
    rnaseq_sample.platform_accession_code = "Illumina HiSeq 2500"
    rnaseq_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = rnaseq_sample
    sample_link.original_file = family_archive
    sample_link.save()

    outcome = geo.download_geo(job.id)
    self.assertFalse(outcome)

    # Pick up whatever the download wrote to the job row.
    job.refresh_from_db()
    self.assertFalse(job.success)

    # It's not necessarily that we didn't extract any files, but
    # none that were usable so it looks like none.
    self.assertEqual(job.failure_reason,
                     "Failed to extract any downloaded files.")
def test_download_file_swapper(self):
    """Check that `sra._download_file` reports success for an .sra file."""
    job = DownloaderJob()
    job.accession_code = "SRR9117853"
    job.save()

    sra_file = OriginalFile()
    sra_file.source_filename = "SRR9117853.sra"
    # NOTE(review): "[email protected]" looks like an e-mail-obfuscation
    # artifact rather than a real user@host prefix -- confirm the
    # intended URL.
    sra_file.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
    sra_file.is_archive = True
    sra_file.save()

    run_sample = Sample()
    run_sample.accession_code = "SRR9117853"
    run_sample.save()

    sample_link = OriginalFileSampleAssociation()
    sample_link.sample = run_sample
    sample_link.original_file = sra_file
    sample_link.save()

    job_link = DownloaderJobOriginalFileAssociation()
    job_link.downloader_job = job
    job_link.original_file = sra_file
    job_link.save()

    outcome = sra._download_file(sra_file.source_url, job, "/tmp/doomed",
                                 force_ftp=False)
    self.assertTrue(outcome)
def _download_file(original_file: OriginalFile,
                   downloader_job: DownloaderJob,
                   target_file_path: str) -> bool:
    """Download file dispatcher. Dispatches to the HTTP or Aspera downloader.

    URLs on ftp.sra.ebi.ac.uk are rewritten to their Aspera form, stored
    back on `original_file.source_url` (the record is not saved here),
    and handed to the Aspera downloader. URLs on ncbi.nlm.nih.gov go to
    the HTTP downloader, after an attempt to swap old-style endpoints
    for an HTTPS one.

    Returns whatever the chosen downloader returns. For any other URL,
    `downloader_job.failure_reason` is set and False is returned.
    """
    download_url = original_file.source_url

    # SRA files have Aspera downloads.
    if "ftp.sra.ebi.ac.uk" in download_url:
        # From: ftp.sra.ebi.ac.uk/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz
        # To:   era-fasp@fasp.sra.ebi.ac.uk:/vol1/fastq/SRR735/005/SRR7353755/SRR7353755_1.fastq.gz
        #
        # Only the host is rewritten. A blanket replace of "ftp" would
        # also mangle an "ftp://" scheme and any "ftp" in the path.
        if download_url.startswith("ftp://"):
            download_url = download_url[len("ftp://"):]
        download_url = download_url.replace(
            "ftp.sra.ebi.ac.uk", "era-fasp@fasp.sra.ebi.ac.uk", 1)
        download_url = download_url.replace(".uk/", ".uk:/", 1)
        original_file.source_url = download_url
        return _download_file_aspera(download_url,
                                     downloader_job,
                                     target_file_path,
                                     0,
                                     original_file,
                                     source="ENA")

    if "ncbi.nlm.nih.gov" in download_url:
        # Try to convert old-style endpoints into new-style endpoints
        # if possible.
        if "anonftp" in download_url or "dbtest" in download_url:
            try:
                accession = download_url.split("/")[-1].split(".sra")[0]
                new_url = get_https_sra_download(accession)
            except Exception:
                # Deliberately best-effort: if the lookup fails for any
                # reason, fall back to the URL we already have.
                new_url = None
            if new_url:
                download_url = new_url
        return _download_file_http(download_url, downloader_job,
                                   target_file_path)

    downloader_job.failure_reason = (
        "Unrecognized URL pattern: {}").format(download_url)
    return False
def test_download_multiple_zips(self, mock_send_job):
    """Tests that each sample gets one processor job no matter what.

    https://github.com/AlexsLemonade/refinebio/pull/351 deals with a
    bug where every file that was extracted to a directory got a
    processor job queued for it each time a downloader job ran which
    pointed to that directory. This test makes sure this bug stays
    squashed.

    It does so by running two downloader jobs for the same experiment
    which use two different zip files. Before this bug was squashed
    this would have resulted in the first sample getting a second
    processor job queued for it because the second downloader job
    would have found the file in the directory.
    """
    # (zip URL, CEL file name, sample accession) per downloader job.
    fixtures = (
        (
            "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip",
            "Waldhof_020604_R30_01-2753_U133A.CEL",
            'E-MEXP-433-Waldhof_020604_R30_01-2753_U133A',
        ),
        (
            "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.2.zip",
            "N08_U133A.CEL",
            'E-MEXP-433-N08_U133A',
        ),
    )

    jobs = []
    for zip_url, cel_name, sample_accession in fixtures:
        job = DownloaderJob()
        job.accession_code = 'E-MEXP-433'
        job.save()
        jobs.append(job)

        cel_file = OriginalFile()
        cel_file.source_url = zip_url
        cel_file.source_filename = cel_name
        cel_file.save()

        job_link = DownloaderJobOriginalFileAssociation()
        job_link.original_file = cel_file
        job_link.downloader_job = job
        job_link.save()

        sample = Sample()
        sample.accession_code = sample_accession
        sample.technology = "MICROARRAY"
        sample.manufacturer = "AFFYMETRIX"
        sample.has_raw = True
        # This is fake, but we don't currently support any agilent
        # platforms so we're using a platform that is supported.
        sample.platform_accession_code = "hgu133a"
        sample.save()

        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample, original_file=cel_file)

    # Both jobs run only after all of the fixtures exist.
    for job in jobs:
        array_express.download_array_express(job.id)

    self.assertEqual(ProcessorJob.objects.all().count(), 2)
def test_organism_shepherd_command(self, mock_nomad, mock_send_job,
                                   mock_get_active_volumes):
    """Tests that the organism shepherd requeues jobs in the right order.

    The situation we're setting up is basically this:
      * There are two experiments.
      * One of them has 1/2 samples processed, the other 0/1
      * One of them needs a DownloaderJob requeued and the other
        needs a ProcessorJob requeued.

    And what we're going to test for is:
      * Both of the jobs that need to be requeued are requeued.
      * The experiment with a processed sample is requeued first
        because it has a higher completion percentage.
    """
    # First, set up our mocks to prevent network calls.
    mock_send_job.return_value = True
    active_volumes = {"1", "2", "3"}
    mock_get_active_volumes.return_value = active_volumes

    def mock_init_nomad(host, port=0, timeout=0):
        # Stand-in client whose jobs.get_jobs() always returns an
        # empty list.
        nomad_client = MagicMock()
        nomad_client.jobs = MagicMock()
        nomad_client.jobs.get_jobs = MagicMock(side_effect=lambda: [])
        return nomad_client

    mock_nomad.side_effect = mock_init_nomad

    zebrafish = Organism(name="DANIO_RERIO",
                         taxonomy_id=1337,
                         is_scientific_name=True)
    zebrafish.save()

    # Experiment that is 0% complete.
    untouched_experiment = Experiment(accession_code='ERP037000')
    untouched_experiment.technology = 'RNA-SEQ'
    untouched_experiment.save()

    ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=untouched_experiment)

    untouched_file = OriginalFile()
    untouched_file.filename = "ERR037001.fastq.gz"
    untouched_file.source_filename = "ERR037001.fastq.gz"
    untouched_file.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz"
    untouched_file.is_archive = True
    untouched_file.save()

    untouched_sample = Sample()
    untouched_sample.accession_code = 'ERR037001'
    untouched_sample.organism = zebrafish
    untouched_sample.save()

    link = OriginalFileSampleAssociation()
    link.sample = untouched_sample
    link.original_file = untouched_file
    link.save()

    link = ExperimentSampleAssociation()
    link.sample = untouched_sample
    link.experiment = untouched_experiment
    link.save()

    # TODO: fix names of all the variables to be appropriate for this test case.
    failed_download_job = DownloaderJob()
    failed_download_job.accession_code = untouched_sample.accession_code
    failed_download_job.downloader_task = "SRA"
    failed_download_job.start_time = timezone.now()
    failed_download_job.end_time = timezone.now()
    failed_download_job.success = False
    failed_download_job.save()

    link = DownloaderJobOriginalFileAssociation()
    link.downloader_job = failed_download_job
    link.original_file = untouched_file
    link.save()

    # Experiment that is 50% complete.
    half_done_experiment = Experiment(accession_code='ERP036000')
    half_done_experiment.technology = 'RNA-SEQ'
    half_done_experiment.save()

    ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=half_done_experiment)

    ## First sample, this one has been processed.
    finished_job = ProcessorJob()
    finished_job.accession_code = "ERR036000"
    finished_job.pipeline_applied = "SALMON"
    finished_job.ram_amount = 12288
    finished_job.start_time = timezone.now()
    finished_job.end_time = timezone.now()
    finished_job.success = True
    finished_job.save()

    finished_file = OriginalFile()
    finished_file.filename = "ERR036000.fastq.gz"
    finished_file.source_filename = "ERR036000.fastq.gz"
    finished_file.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    finished_file.is_archive = True
    finished_file.save()

    finished_sample = Sample()
    finished_sample.accession_code = 'ERR036000'
    finished_sample.organism = zebrafish
    finished_sample.save()

    link = OriginalFileSampleAssociation()
    link.sample = finished_sample
    link.original_file = finished_file
    link.save()

    link = ProcessorJobOriginalFileAssociation()
    link.processor_job = finished_job
    link.original_file = finished_file
    link.save()

    link = ExperimentSampleAssociation()
    link.sample = finished_sample
    link.experiment = half_done_experiment
    link.save()

    ## Second sample, this one hasn't been processed.
    pending_file = OriginalFile()
    pending_file.filename = "ERR036001.fastq.gz"
    pending_file.source_filename = "ERR036001.fastq.gz"
    pending_file.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz"
    pending_file.is_archive = True
    pending_file.save()

    pending_sample = Sample()
    pending_sample.accession_code = 'ERR036001'
    pending_sample.organism = zebrafish
    pending_sample.save()

    link = OriginalFileSampleAssociation()
    link.sample = pending_sample
    link.original_file = pending_file
    link.save()

    link = ExperimentSampleAssociation()
    link.sample = pending_sample
    link.experiment = half_done_experiment
    link.save()

    failed_processor_job = ProcessorJob()
    failed_processor_job.pipeline_applied = "SALMON"
    failed_processor_job.accession_code = pending_sample.accession_code
    failed_processor_job.ram_amount = 12288
    failed_processor_job.start_time = timezone.now()
    failed_processor_job.end_time = timezone.now()
    failed_processor_job.success = False
    failed_processor_job.save()

    link = ProcessorJobOriginalFileAssociation()
    link.processor_job = failed_processor_job
    link.original_file = pending_file
    link.save()

    # Setup is done, actually run the command.
    call_command("organism_shepherd", organism_name="DANIO_RERIO")

    # Verify that the jobs were called in the correct order. Each entry
    # of mock_calls is (name, positional args, keyword args).
    sent = mock_send_job.mock_calls

    first_job_type = sent[0][1][0]
    first_job = sent[0][2]["job"]
    self.assertEqual(first_job_type, ProcessorPipeline.SALMON)
    self.assertEqual(first_job.pipeline_applied,
                     failed_processor_job.pipeline_applied)
    self.assertEqual(first_job.ram_amount, failed_processor_job.ram_amount)
    self.assertIn(first_job.volume_index, active_volumes)

    failed_processor_job.refresh_from_db()
    self.assertEqual(first_job, failed_processor_job.retried_job)

    second_job_type = sent[1][1][0]
    second_job = sent[1][2]["job"]
    self.assertEqual(second_job_type, Downloaders.SRA)
    self.assertEqual(second_job.accession_code,
                     failed_download_job.accession_code)
    self.assertEqual(second_job.downloader_task,
                     failed_download_job.downloader_task)

    failed_download_job.refresh_from_db()
    self.assertEqual(second_job, failed_download_job.retried_job)
def create_samples_from_api(self, experiment: Experiment,
                            platform_dict: Dict) -> List[Sample]:
    """Generates a Sample item for each sample in an AE experiment.

    There are many possible data situations for a sample:

        - If the sample only has raw data available:
            - If it is on a platform that we support:
                Download this raw data and process it
            - If it is not on a platform we support:
                Don't download anything, don't process anything
        - If the sample has both raw and derived data:
            - If the raw data is on a platform we support:
                Download the raw data and process it, abandon the
                derived data
            - If the raw data is not on a platform we support
                Download the derived data and no-op it, abandon the
                raw data
        - If the sample only has derived data:
            Download the derived data and no-op it.

    See an example at:
    https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

    `platform_dict` must provide the "platform_accession_name",
    "platform_accession_code" and "manufacturer" keys.

    Returns only the samples that were newly created by this call.
    Samples that already existed are still associated with
    `experiment` but are not part of the returned list.
    """
    created_samples = []

    samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
    r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
    samples = r.json()["experiment"]["sample"]

    # The SDRF is the complete metadata record on a sample/property basis.
    # We run this through our harmonizer and then attach the properties
    # to our created samples.
    SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
    sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
    sdrf_samples = harmony.parse_sdrf(sdrf_url)
    harmonized_samples = harmony.harmonize(sdrf_samples)

    # An experiment can have many samples
    for sample_data in samples:

        # For some reason, this sample has no files associated with it.
        if "file" not in sample_data or len(sample_data['file']) == 0:
            continue

        # Each sample is given an experimentally-unique title.
        flat_sample = utils.flatten(sample_data)
        title = harmony.extract_title(flat_sample)

        # A sample may actually have many sub files.
        # If there is raw data, take that.
        # If not, take the derived.
        has_raw = False
        for sub_file in sample_data['file']:

            # For ex: E-GEOD-15645
            # This rewrites the entry inside sample_data itself, not a
            # copy of it.
            if isinstance(sub_file['comment'], list):
                sub_file['comment'] = sub_file['comment'][0]
            first_comment = sub_file['comment']

            # Some have the 'data' field, but not the actual data
            # Ex: E-GEOD-9656
            if (sub_file['type'] == "data"
                    and first_comment.get('value', None) is not None):
                has_raw = True
            if 'raw' in first_comment.get('value', ''):
                has_raw = True

        # Don't get the raw data if it's only a 1-color sample. This
        # does not depend on the individual sub file, so it is decided
        # once per sample.
        sample_text = str(sample_data)
        if 'Cy3' in sample_text and 'Cy5' not in sample_text:
            has_raw = False

        # Reset per sample so that values from a previous sample can
        # never leak into this one when every sub file gets skipped.
        skip_sample = False
        download_url = None
        filename = None
        for sub_file in sample_data['file']:

            # Skip derived data if we have it raw.
            if has_raw and "derived data" in sub_file['type']:
                continue

            download_url = None
            filename = sub_file["name"]

            # sub_file["comment"] is only a list if there's
            # more than one comment...
            # NOTE(review): because of the in-place collapse in the
            # loop above, `comments` is normally no longer a list
            # here, so the "FTP file" search below looks unreachable.
            # Confirm whether the first loop was meant to work on a
            # copy.
            comments = sub_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP
                # file" or: "ArrayExpress FTP file". If there is
                # no comment with a name including "FTP file" then
                # we don't know where to download it, so the sample
                # is skipped below.
                for comment in comments:
                    if "FTP file" in comment["name"]:
                        download_url = comment["value"]
                        break
            else:
                download_url = comments["value"]

            # The accession code is derived from the file name further
            # down and does not exist yet at this point, so these
            # messages identify the sample by its title.
            if not download_url:
                logger.error(
                    "Sample %s did not specify a download url, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

            if not filename:
                logger.error(
                    "Sample %s did not specify a filename, skipping.",
                    title,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sub_file=sub_file)
                skip_sample = True
                continue

        if skip_sample:
            continue

        if download_url is None or filename is None:
            # Every sub file was passed over as derived data, so there
            # is nothing to download for this sample.
            logger.error(
                "Sample %s has no usable files, skipping.",
                title,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            continue

        # The accession code is not a simple matter to determine.
        sample_source_name = sample_data["source"].get("name", "")
        sample_assay_name = sample_data["assay"].get("name", "")
        sample_accession_code = self.determine_sample_accession(
            experiment.accession_code, sample_source_name,
            sample_assay_name, filename)

        # Figure out the Organism for this sample
        organism_name = UNKNOWN
        for characteristic in sample_data["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == UNKNOWN:
            logger.error(
                "Sample %s did not specify the organism name.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
            continue

        organism = Organism.get_object_for_name(organism_name)

        # Create the sample object
        try:
            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)

            # If input experiment includes new protocol information,
            # update sample's protocol_info.
            existing_protocols = sample_object.protocol_info
            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols, experiment.protocol_description,
                experiment.source_url + '/protocols')
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()

            # The basics
            sample_object.source_database = "ARRAY_EXPRESS"
            sample_object.title = title
            sample_object.accession_code = sample_accession_code
            sample_object.source_archive_url = samples_endpoint
            sample_object.organism = organism
            sample_object.platform_name = platform_dict[
                "platform_accession_name"]
            sample_object.platform_accession_code = platform_dict[
                "platform_accession_code"]
            sample_object.manufacturer = platform_dict["manufacturer"]
            sample_object.technology = "MICROARRAY"

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment.protocol_description,
                protocol_url=experiment.source_url + '/protocols')
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)
            sample_object.save()

            sample_annotation = SampleAnnotation()
            sample_annotation.data = sample_data
            sample_annotation.sample = sample_object
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            original_file = OriginalFile()
            original_file.filename = filename
            original_file.source_filename = filename
            original_file.source_url = download_url
            original_file.is_downloaded = False
            original_file.is_archive = True
            original_file.has_raw = has_raw
            original_file.save()

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.original_file = original_file
            original_file_sample_association.sample = sample_object
            original_file_sample_association.save()

            created_samples.append(sample_object)

            logger.debug(
                "Created " + str(sample_object),
                experiment_accession_code=experiment.accession_code,
                survey_job=self.survey_job.id,
                sample=sample_object.id)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment, organism=organism)

    return created_samples
def test_download_aspera_and_ftp(self):
    """Exercise `geo._download_file` over Aspera and FTP, then a failing Aspera call.

    The same GEO supplementary file is fetched twice, once with
    `force_ftp=False` and once with `force_ftp=True`; each time the call must
    return a truthy result and leave a file on disk.  Finally
    `geo._download_file_aspera` is called with a non-GEO URL and must return a
    falsy result and record a failure reason on the job.
    """
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent'
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    # Download target: <root>/<accession code>/<last URL path segment>.
    LOCAL_ROOT_DIR = "/home/user/data_store"
    os.makedirs(LOCAL_ROOT_DIR + '/' + sample.accession_code, exist_ok=True)
    dl_file_path = (LOCAL_ROOT_DIR + '/' + sample.accession_code + '/'
                    + original_file.source_url.split('/')[-1])

    # Aspera
    result = geo._download_file(original_file.source_url,
                                file_path=dl_file_path,
                                job=dlj,
                                force_ftp=False)
    self.assertTrue(result)
    self.assertTrue(os.path.exists(dl_file_path))
    # Remove the file so the FTP check below cannot pass on a leftover.
    os.remove(dl_file_path)

    # FTP
    result = geo._download_file(original_file.source_url,
                                file_path=dl_file_path,
                                job=dlj,
                                force_ftp=True)
    self.assertTrue(result)
    self.assertTrue(os.path.exists(dl_file_path))
    os.remove(dl_file_path)

    # Aspera, fail
    result = geo._download_file_aspera("https://rich.zone/cool_horse.jpg",
                                       target_file_path=dl_file_path,
                                       downloader_job=dlj,
                                       attempt=5)
    self.assertFalse(result)
    # The job object is passed in directly, so the failure reason is
    # expected on this same instance.
    self.assertIsNotNone(dlj.failure_reason)
def test_download_geo(self, mock_send_task):
    """Run `geo.download_geo` end to end for a single gzipped GEO file.

    Sets up a downloader job, one archive `OriginalFile` and one sample, runs
    the downloader, and checks that:

    * the sample ends up linked to two files (the archive and the file
      extracted from it),
    * the archive is no longer marked as downloaded while the extracted
      file is,
    * the job reports success, and
    * a processor job was created with the expected pipeline and RAM.

    `mock_send_task` is supplied by a patch decorator outside this block and
    is not used directly here.
    """
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.filename = "GSE22427_non-normalized.txt.gz"
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AGILENT"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "Illumina_RatRef-12_V1.0"
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent'
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    download_result = geo.download_geo(dlj.id)

    file_assocs = OriginalFileSampleAssociation.objects.filter(sample=sample)
    self.assertEqual(file_assocs.count(), 2)

    for file_assoc in file_assocs:
        associated_file = file_assoc.original_file
        if associated_file.filename.endswith(".gz"):
            # We delete the archive after we extract from it
            self.assertFalse(associated_file.is_downloaded)
        else:
            self.assertTrue(associated_file.is_downloaded)

    # Make sure it worked
    self.assertTrue(download_result)
    # download_geo only receives the job id, so anything it wrote to the job
    # is not visible on our local instance; reload it so the failure_reason
    # check is not vacuous.
    dlj.refresh_from_db()
    self.assertIsNone(dlj.failure_reason)

    # Fetch the processor job once rather than re-querying per assertion.
    processor_job = ProcessorJob.objects.all().first()
    self.assertIsNotNone(processor_job)
    self.assertEqual(processor_job.pipeline_applied, "AGILENT_TWOCOLOR_TO_PCL")
    self.assertEqual(processor_job.ram_amount, 2048)
def _record_unpacked_file(archive_record, absolute_path, filename, has_raw):
    """Create and save the OriginalFile describing one unpacked file.

    Args:
        archive_record: the OriginalFile that was downloaded; its
            `source_url` and `source_filename` are copied to the new record.
        absolute_path: where the unpacked file lives on disk.
        filename: name of the unpacked file.
        has_raw: value stored in the new record's `has_raw` attribute.

    Returns:
        The saved OriginalFile, marked as downloaded and not an archive.
    """
    unpacked = OriginalFile()
    unpacked.is_downloaded = True
    unpacked.is_archive = False
    unpacked.absolute_file_path = absolute_path
    unpacked.filename = filename
    # Size and checksum are computed after the path is set and before saving,
    # in the same order as the code this helper replaces.
    unpacked.calculate_size()
    unpacked.calculate_sha1()
    unpacked.has_raw = has_raw
    unpacked.source_url = archive_record.source_url
    unpacked.source_filename = archive_record.source_filename
    unpacked.save()
    return unpacked


def download_geo(job_id: int) -> bool:
    """The main function for the GEO Downloader.

    Downloads a single file (tar, tgz, gz or plain) containing the files
    representing samples relating to a single experiment stored in GEO,
    unpacks it, records an OriginalFile for every usable unpacked file and
    creates processor jobs for them.

    Args:
        job_id: id of the downloader job to run.

    Returns:
        True when at least one file was unpacked and recorded, False
        otherwise (no associated file, extraction error, or nothing usable
        found in the download).
    """
    job = utils.start_job(job_id)

    file_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        downloader_job=job)
    try:
        original_file = file_assocs[0].original_file
    except IndexError:
        # A job without any file association cannot do anything useful; fail
        # it explicitly instead of crashing with an unhandled IndexError.
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.",
                     downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        return False

    url = original_file.source_url
    accession_code = job.accession_code

    sample_assocs = OriginalFileSampleAssociation.objects.filter(
        original_file=original_file)
    related_samples = Sample.objects.filter(
        id__in=sample_assocs.values('sample_id'))

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are
    # contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + '/' + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + '/' + accession_code + '/' + url.split(
        '/')[-1]

    logger.debug("Starting to download: " + url,
                 job_id=job_id,
                 accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    # These files are tarred, and also subsequently gzipped
    if '.tar' in dl_file_path:
        try:
            extracted_files = _extract_tar(dl_file_path, accession_code)
        except Exception as e:
            # Store text, not the exception object, as the failure reason.
            job.failure_reason = str(e)
            logger.exception("Error occured while extracting tar file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return False

        for og_file in extracted_files:
            filename = og_file['filename']
            # The sample accession is the part of the file name before the
            # first '_' or, failing that, before the first '.'.
            if '_' in filename:
                sample_id = filename.split('_')[0]
            else:
                sample_id = filename.split('.')[0]

            try:
                sample = Sample.objects.get(accession_code=sample_id)
            except Exception:
                # We don't have this sample, but it's not a total failure. This happens.
                continue

            try:
                # Files from the GEO supplemental file are gzipped inside of the tarball. Great!
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=sample_id)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = og_file['absolute_path']
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                if '.gz' in og_file['filename']:
                    extracted_subfile = _extract_gz(og_file['absolute_path'],
                                                    accession_code)
                else:
                    extracted_subfile = [og_file]

                actual_file = _record_unpacked_file(
                    original_file,
                    extracted_subfile[0]['absolute_path'],
                    extracted_subfile[0]['filename'],
                    True)

                original_file_sample_association = OriginalFileSampleAssociation()
                original_file_sample_association.sample = sample
                original_file_sample_association.original_file = actual_file
                original_file_sample_association.save()

                # The inner archive is no longer needed once unpacked.
                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception:
                # TODO - is this worth failing a job for?
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                # If we don't know why we have it, get rid of it.
                os.remove(og_file["absolute_path"])

    # This is a .tgz file.
    elif '.tgz' in dl_file_path:
        # If this is the MINiML file, it has been preprocessed
        has_raw = '_family.xml.tgz' not in dl_file_path

        try:
            extracted_files = _extract_tgz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = str(e)
            logger.exception("Error occured while extracting tgz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return False

        for og_file in extracted_files:
            # Only .txt members are considered in this branch.
            if '.txt' not in og_file['filename']:
                continue

            try:
                gsm_id = og_file['filename'].split('-')[0]
                sample = Sample.objects.get(accession_code=gsm_id)
            except Exception:
                # No matching sample: discard the file and move on.
                os.remove(og_file["absolute_path"])
                continue

            actual_file = _record_unpacked_file(original_file,
                                                og_file['absolute_path'],
                                                og_file['filename'],
                                                has_raw)

            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()

            unpacked_sample_files.append(actual_file)

    # These files are only gzipped.
    # These are generally the _actually_ raw (rather than the non-raw data in a RAW file) data
    elif '.gz' in dl_file_path:
        try:
            extracted_files = _extract_gz(dl_file_path, accession_code)
        except Exception as e:
            job.failure_reason = str(e)
            logger.exception("Error occured while extracting gz file.",
                             path=dl_file_path,
                             exception=str(e))
            utils.end_downloader_job(job, success=False)
            return False

        for og_file in extracted_files:
            filename = og_file['filename']
            sample_id = filename.split('.')[0]

            try:
                # The archive we downloaded
                archive_file = OriginalFile.objects.get(
                    source_filename__contains=filename)
                archive_file.is_downloaded = True
                archive_file.is_archive = True
                archive_file.absolute_file_path = dl_file_path
                archive_file.calculate_size()
                archive_file.calculate_sha1()
                archive_file.save()

                actual_file = _record_unpacked_file(original_file,
                                                    og_file['absolute_path'],
                                                    og_file['filename'],
                                                    True)

                # A bare .gz carries no per-sample naming, so link the file
                # to every sample tied to the downloaded archive.
                for related_sample in related_samples:
                    new_association = OriginalFileSampleAssociation()
                    new_association.original_file = actual_file
                    new_association.sample = related_sample
                    new_association.save()

                archive_file.delete_local_file()
                archive_file.is_downloaded = False
                archive_file.save()

                unpacked_sample_files.append(actual_file)
            except Exception:
                logger.debug(
                    "Found a file we didn't have an OriginalFile for! Why did this happen?: "
                    + og_file['filename'],
                    exc_info=1,
                    file=og_file['filename'],
                    sample_id=sample_id,
                    accession_code=accession_code)
                os.remove(og_file["absolute_path"])

    # This is probably just a .txt file
    else:
        filename = dl_file_path.split('/')[-1]

        actual_file = _record_unpacked_file(original_file, dl_file_path,
                                            filename, True)

        for related_sample in related_samples:
            new_association = OriginalFileSampleAssociation()
            new_association.original_file = actual_file
            new_association.sample = related_sample
            new_association.save()

        unpacked_sample_files.append(actual_file)

    success = bool(unpacked_sample_files)
    if success:
        logger.debug("File downloaded and extracted successfully.",
                     url=url,
                     dl_file_path=dl_file_path,
                     downloader_job=job_id)
        utils.create_processor_jobs_for_original_files(unpacked_sample_files,
                                                       job)
    else:
        logger.info("Unable to extract any files.",
                    url=url,
                    dl_file_path=dl_file_path,
                    downloader_job=job_id)
        job.failure_reason = "Failed to extract any downloaded files."

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
def download_geo(job_id: int) -> bool:
    """The main function for the GEO Downloader.

    Downloads a single archive containing the files representing samples
    relating to a single experiment stored in GEO, enumerates the files
    inside it, records an OriginalFile for each usable one and creates
    processor jobs for them.

    Args:
        job_id: id of the downloader job to run.

    Returns:
        True when at least one file was extracted and queued for processing,
        False otherwise (no associated file, extraction error, or nothing
        usable in the archive).
    """
    job = utils.start_job(job_id)
    accession_code = job.accession_code
    original_file = job.original_files.first()

    if not original_file:
        job.failure_reason = "No files associated with the job."
        logger.error("No files associated with the job.", downloader_job=job_id)
        utils.end_downloader_job(job, success=False)
        return False

    url = original_file.source_url
    related_samples = original_file.samples.exclude(technology="RNA-SEQ")

    # First, download the sample archive URL.
    # Then, unpack all the ones downloaded.
    # Then create processor jobs!

    # The files for all of the samples are contained within the same zip file. Therefore only
    # download the one.
    os.makedirs(LOCAL_ROOT_DIR + "/" + accession_code, exist_ok=True)
    dl_file_path = LOCAL_ROOT_DIR + "/" + accession_code + "/" + url.split("/")[-1]

    logger.debug("Starting to download: " + url, job_id=job_id, accession_code=accession_code)
    _download_file(url, dl_file_path, job)
    original_file.absolute_file_path = dl_file_path
    original_file.is_downloaded = True
    original_file.save()

    unpacked_sample_files = []

    try:
        # enumerate all files inside the archive
        archived_files = list(ArchivedFile(dl_file_path).get_files())
    except FileExtractionError as e:
        # Store text, not the exception object, as the failure reason.
        job.failure_reason = str(e)
        logger.exception(
            "Error occurred while extracting file.", path=dl_file_path, exception=str(e)
        )
        utils.end_downloader_job(job, success=False)
        return False

    for og_file in archived_files:
        sample = og_file.get_sample()

        # We don't want RNA-Seq data from GEO:
        # https://github.com/AlexsLemonade/refinebio/issues/966
        if sample and sample.technology == "RNA-SEQ":
            logger.warning("RNA-Seq sample found in GEO downloader job.", sample=sample)
            continue

        if not sample and (
            not og_file.is_processable() or og_file.experiment_accession_code() != accession_code
        ):
            # skip the files that we know are not processable and can't be associated with a sample
            # also skip the files where we couldn't find a sample and they don't mention the
            # current experiment
            continue

        potential_existing_file = OriginalFile.objects.filter(
            source_filename=original_file.source_filename,
            filename=og_file.filename,
            is_archive=False,
        ).first()
        if potential_existing_file:
            # We've already created this record, let's see if we actually
            # needed to download it or if we just got it because we needed
            # a file in the same archive.
            if potential_existing_file.needs_processing():
                if not potential_existing_file.is_downloaded:
                    potential_existing_file.is_downloaded = True
                    potential_existing_file.save()

                unpacked_sample_files.append(potential_existing_file)
            continue

        # Then this is a new file and we should create an original file for it
        actual_file = OriginalFile()
        actual_file.is_downloaded = True
        actual_file.is_archive = False
        actual_file.absolute_file_path = og_file.file_path
        actual_file.filename = og_file.filename
        actual_file.calculate_size()
        actual_file.calculate_sha1()
        actual_file.has_raw = True
        actual_file.source_url = original_file.source_url
        actual_file.source_filename = original_file.source_filename
        actual_file.save()

        # try to see if the file should be associated with a sample
        if sample:
            targets = [sample]
        else:
            # if not, we can associate this file with all samples in the experiment
            targets = related_samples

        for target_sample in targets:
            original_file_sample_association = OriginalFileSampleAssociation()
            original_file_sample_association.sample = target_sample
            original_file_sample_association.original_file = actual_file
            original_file_sample_association.save()

        unpacked_sample_files.append(actual_file)

    success = bool(unpacked_sample_files)
    if success:
        logger.debug(
            "File downloaded and extracted successfully.",
            url=url,
            dl_file_path=dl_file_path,
            downloader_job=job_id,
        )
        create_processor_jobs_for_original_files(unpacked_sample_files, job)
    else:
        logger.info(
            "Unable to extract any files.", url=url, dl_file_path=dl_file_path, downloader_job=job_id,
        )
        job.failure_reason = "Failed to extract any downloaded files."

    if original_file.is_archive:
        original_file.delete_local_file()

    utils.end_downloader_job(job, success)

    return success
def test_no_repeat_jobs(self):
    """Files that already belong to a DownloaderJob must not get another one.

    One sample with two original files is created, both files are attached
    to a single pre-existing DownloaderJob, and then the surveyor is asked to
    queue jobs for those same files.  The job count must stay at one.
    """
    experiment = Experiment()
    experiment.accession_code = "Experiment1"
    experiment.save()

    sample = Sample()
    sample.accession_code = "Sample1"
    sample.platform_accession_code = "Illumina Genome Analyzer"
    sample.platform_accession_name = "Illumina Genome Analyzer"
    sample.technology = "RNA-SEQ"
    sample.manufacturer = "ILLUMINA"
    sample.source_database = "SRA"
    sample.save()

    # Two not-yet-downloaded raw files, each linked to the one sample.
    file_specs = (
        ("first_url", "first_filename"),
        ("second_url", "second_filename"),
    )
    original_files = []
    for source_url, source_filename in file_specs:
        og_file = OriginalFile()
        og_file.source_url = source_url
        og_file.source_filename = source_filename
        og_file.is_downloaded = False
        og_file.has_raw = True
        og_file.save()

        sample_link = OriginalFileSampleAssociation()
        sample_link.original_file = og_file
        sample_link.sample = sample
        sample_link.save()

        original_files.append(og_file)

    # The job that already covers both files.
    existing_job = DownloaderJob()
    existing_job.save()
    for og_file in original_files:
        job_link = DownloaderJobOriginalFileAssociation(
            downloader_job=existing_job, original_file=og_file
        )
        job_link.save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        original_files, experiment.accession_code
    )

    # We made one DownloaderJob in this test, so
    # queue_downloader_job_for_original_files didn't have anything
    # to do, so there should still be only one:
    self.assertEqual(1, DownloaderJob.objects.all().count())
def test_queue_downloader_jobs_for_original_files(self, mock_send_task):
    """Make sure one DownloaderJob is queued per batch of original files.

    An experiment with two samples is created, each sample gets two original
    files, and `queue_downloader_job_for_original_files` is called once per
    sample's files.  Two DownloaderJobs are expected afterwards.

    `mock_send_task` is supplied by a patch decorator outside this block and
    is not used directly here.
    """
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    samples = []
    for accession_code in ("Sample1", "Sample2"):
        sample_object = Sample()
        sample_object.accession_code = accession_code
        sample_object.platform_accession_code = "Illumina Genome Analyzer"
        sample_object.platform_accession_name = "Illumina Genome Analyzer"
        sample_object.technology = "RNA-SEQ"
        sample_object.manufacturer = "ILLUMINA"
        sample_object.source_database = "SRA"
        sample_object.save()
        samples.append(sample_object)

    for sample_object in samples:
        association = ExperimentSampleAssociation()
        association.experiment = experiment_object
        association.sample = sample_object
        association.save()

    sample_object_1, sample_object_2 = samples
    sample_1_original_files = []
    sample_2_original_files = []

    # (source_url, source_filename, owning sample, list the file is queued from)
    # The second file belongs to sample 1, so it is collected in the sample-1
    # list; it was previously appended to the sample-2 list even though its
    # association pointed at sample 1.
    file_specs = (
        ("first_url", "first_filename", sample_object_1, sample_1_original_files),
        ("second_url", "second_filename", sample_object_1, sample_1_original_files),
        ("third_url", "third_filename", sample_object_2, sample_2_original_files),
        ("fourth_url", "fourth_filename", sample_object_2, sample_2_original_files),
    )
    for source_url, source_filename, owner, bucket in file_specs:
        original_file = OriginalFile()
        original_file.source_url = source_url
        original_file.source_filename = source_filename
        original_file.is_downloaded = False
        original_file.has_raw = True
        original_file.save()
        bucket.append(original_file)

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.original_file = original_file
        original_file_sample_association.sample = owner
        original_file_sample_association.save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        sample_1_original_files, experiment_object.accession_code
    )
    surveyor.queue_downloader_job_for_original_files(
        sample_2_original_files, experiment_object.accession_code
    )

    self.assertEqual(DownloaderJob.objects.all().count(), 2)
def test_dharma(self):
    """Check when `utils.start_job` exits the process under `force_harakiri`.

    Two jobs on this worker have a start time and a third (`dlj3`) does not.
    Starting `dlj3` with `max_downloader_jobs_per_node=2` must raise
    SystemExit; starting it with a limit of 15 must not.
    """
    dlj1 = DownloaderJob()
    dlj1.accession_code = 'D1'
    dlj1.worker_id = get_instance_id()
    dlj1.start_time = datetime.datetime.now()
    dlj1.save()

    dlj2 = DownloaderJob()
    dlj2.accession_code = 'D2'
    dlj2.worker_id = get_instance_id()
    dlj2.start_time = datetime.datetime.now()
    dlj2.save()

    # dlj3 deliberately has no start_time: it is the job being started below.
    dlj3 = DownloaderJob()
    dlj3.accession_code = 'D3'
    dlj3.worker_id = get_instance_id()
    dlj3.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
    original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj3
    assoc.save()

    sample = Sample()
    sample.accession_code = 'Blahblahblah'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    # With the per-node limit at 2, starting the job is supposed to exit.
    with self.assertRaises(SystemExit):
        utils.start_job(dlj3.id,
                        max_downloader_jobs_per_node=2,
                        force_harakiri=True)

    # With a generous limit the job must not exit.
    try:
        utils.start_job(dlj3.id,
                        max_downloader_jobs_per_node=15,
                        force_harakiri=True)
    except SystemExit:
        self.fail("start_job exited even though the job limit was not reached")
    except Exception:
        # The original test explicitly tolerates any other exception here;
        # the cause is not visible from this block (NOTE(review): presumably
        # raised because the job was already started above - confirm).
        pass