def init_objects():
    """Create and save a DOWNLOADED ARRAY_EXPRESS batch with one CEL file.

    Returns the saved batch with its ``files`` list populated.
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        # Repaired garbled literal (was "H**O SAPIENS"); 9606 is Homo sapiens.
        organism_name="HOMO SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.DOWNLOADED.value
    )
    batch.save()

    file = File(size_in_bytes=0,
                download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
                raw_format="CEL",
                processed_format="PCL",
                name="CE1234.CEL",
                internal_location="A-AFFY-1/AFFY_TO_PCL/",
                batch=batch)
    file.save()

    batch.files = [file]
    return batch
def test_multiple_batches(self, mock_download_file):
    """An SRA downloader job holding two batches must fail with a clear reason."""
    # Just in case this test ever breaks, we don't actually want
    # to download the file because that'll take a while to fail.
    mock_download_file.return_value = True

    batch, _ = self.insert_objects()

    # Attach a second batch to trigger the too-many-batches failure path.
    second_batch = Batch(survey_job=self.survey_job,
                         source_type="SRA",
                         pipeline_required="SALMON",
                         platform_accession_code="IlluminaHiSeq2000",
                         experiment_accession_code="DRX001564",
                         experiment_title="It doesn't really matter.",
                         organism_id=9031,
                         organism_name="GALLUS GALLUS",
                         release_date="2013-07-19",
                         last_uploaded_date="2017-09-11",
                         status=BatchStatuses.NEW.value)
    second_batch.save()

    job = DownloaderJob.create_job_and_relationships(
        batches=[batch, second_batch], downloader_task="dummy")
    job.save()

    sra.download_sra(job.id)

    finished_job = DownloaderJob.objects.get(id=job.id)
    self.assertFalse(finished_job.success)
    self.assertEqual(finished_job.failure_reason,
                     ("More than one batch found for SRA downloader job. "
                      "There should only be one."))
def setUpClass(cls):
    """Insert the SurveyJob/Batch/File fixtures shared by every test.

    NOTE(review): presumably decorated with ``@classmethod`` by the
    enclosing TestCase -- the decorator is outside this view.
    """
    survey_job = SurveyJob(
        source_type="ARRAY_EXPRESS"
    )
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-141",
        experiment_accession_code="E-GEOD-59071",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        # Repaired garbled literal (was "H**O SAPIENS"); 9606 is Homo sapiens.
        organism_name="HOMO SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value
    )
    batch.save()

    file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL",
        batch=batch
    )
    file.save()

    super(FilesTestCase, cls).setUpClass()
def _insert_salmon_index():
    """Creates a batch for the index for the organism for the test.

    Saves a PROCESSED transcriptome-index batch, its kmer_size key/value,
    and the index GTF file.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="TRANSCRIPTOME_INDEX",
                  pipeline_required="TRANSCRIPTOME_INDEX",
                  platform_accession_code="TEST",
                  experiment_accession_code="HOMO_SAPIENS",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  # Repaired garbled literal (was "H**O SAPIENS"); matches
                  # the HOMO_SAPIENS accession above.
                  organism_name="HOMO SAPIENS",
                  release_date="2017-11-02",
                  last_uploaded_date="2017-11-02",
                  status=BatchStatuses.PROCESSED.value)
    batch.save()

    kmer_size = BatchKeyValue(key="kmer_size", value="23", batch=batch)
    kmer_size.save()

    index_file = File(
        size_in_bytes=2214725074,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="Homo_sapiens_short.gtf.gz",
        internal_location="TEST/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens"
                      "/Homo_sapiens.GRCh38.90.gtf.gz"),
        batch=batch)
    index_file.save()
def get_batch():
    """Create and save a NEW ARRAY_EXPRESS batch with a single CEL file.

    Returns the saved batch (the File is saved but not attached to
    ``batch.files``).
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="ARRAY_EXPRESS",
                  pipeline_required="AFFY_TO_PCL",
                  platform_accession_code="A-AFFY-1",
                  experiment_accession_code="E-MTAB-3050",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  # Repaired garbled literal (was "H**O SAPIENS"); 9606 is Homo sapiens.
                  organism_name="HOMO SAPIENS",
                  release_date="2017-05-05",
                  last_uploaded_date="2017-05-05",
                  status=BatchStatuses.NEW.value)
    batch.save()

    File(size_in_bytes=0,
         download_url="example.com",
         raw_format="CEL",
         processed_format="PCL",
         name="CE1234.CEL",
         internal_location="A-AFFY-1/AFFY_TO_PCL/",
         batch=batch).save()

    return batch
def handle(self, *args, **options):
    """Queue a dummy ARRAY_EXPRESS downloader job (management command entry point)."""
    # Create all the dummy data that would have been created
    # before a downloader job could have been generated.
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="ARRAY_EXPRESS",
                  pipeline_required="AFFY_TO_PCL",
                  platform_accession_code="A-AFFY-141",
                  experiment_accession_code="E-GEOD-59071",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  # Repaired garbled literal (was "H**O SAPIENS"); 9606 is Homo sapiens.
                  organism_name="HOMO SAPIENS",
                  release_date="2017-05-05",
                  last_uploaded_date="2017-05-05",
                  status=BatchStatuses.NEW.value)
    batch.save()

    file = File(
        batch=batch,
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072_CD_colon_active_2.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL")
    file.save()

    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[batch])
    send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
def test_aspera_downloader(self):
    """Download one FASTQ file through sra._download_file and expect success."""
    gallus_batch = Batch(survey_job=self.survey_job,
                         source_type="SRA",
                         pipeline_required="SALMON",
                         platform_accession_code="IlluminaHiSeq2000",
                         experiment_accession_code="DRX001563",
                         experiment_title="It doesn't really matter.",
                         organism_id=9031,
                         organism_name="GALLUS GALLUS",
                         release_date="2013-07-19",
                         last_uploaded_date="2017-09-11",
                         status=BatchStatuses.NEW.value)
    gallus_batch.save()

    # This is converted from FTP URL to use Aspera
    fastq_file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR036000_1.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=gallus_batch)

    job = DownloaderJob()
    self.assertTrue(sra._download_file(fastq_file, job, fastq_file.name))
def run_trasnscriptome_processor(self):
    """Queue a TRANSCRIPTOME_INDEX processor job with dummy fixture data.

    NOTE(review): the method name misspells "transcriptome"; kept as-is
    because callers reference it by this exact name.
    """
    # Create all the dummy data that would have been created
    # before a processor job could have been generated.
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    index_batch = Batch(
        survey_job=survey_job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="EnsemblPlants",
        experiment_accession_code="aegilops_tauschii",
        experiment_title="It doesn't really matter.",
        organism_id=37682,
        organism_name="AEGILOPS TAUSCHII",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    index_batch.save()

    BatchKeyValue(batch=index_batch, key="kmer_size", value="31").save()

    annotation_file = File(
        name="aegilops_tauschii_short.gtf.gz",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
            "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        raw_format="gtf.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=index_batch)
    annotation_file.save()

    sequence_file = File(
        name="aegilops_tauschii_short.fa.gz",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
            "/aegilops_tauschii/dna/Aegilops_tauschii."
            "ASM34733v1.dna.toplevel.fa.gz"),
        raw_format="fa.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=index_batch)
    sequence_file.save()

    job = ProcessorJob.create_job_and_relationships(batches=[index_batch])

    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[index_batch.pipeline_required], job.id)
def test_survey(self):
    """Batches whose files share a download URL get grouped together."""
    surveyor = ArrayExpressSurveyor(
        SurveyJob(source_type="ARRAY_EXPRESS"))

    url_a_first = File(download_url="a")
    url_a_second = File(download_url="a")
    url_b_file = File(download_url="b")
    url_a_third = File(download_url="a")

    single_file_batch_1 = Batch(files=[url_a_first])
    single_file_batch_2 = Batch(files=[url_a_second])
    two_file_batch = Batch(files=[url_b_file, url_a_third])
    surveyor.batches = [single_file_batch_1,
                        single_file_batch_2,
                        two_file_batch]

    groups = surveyor.group_batches()
    self.assertEqual(groups,
                     [[single_file_batch_1, single_file_batch_2],
                      [two_file_batch]])
def init_objects():
    """Create a DOWNLOADED transcriptome-index batch with GTF + FASTA files.

    Returns ``(batch, gtf_file, fasta_file)``.
    """
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()

    index_batch = Batch(survey_job=job,
                        source_type="TRANSCRIPTOME_INDEX",
                        pipeline_required="TRANSCRIPTOME_INDEX",
                        platform_accession_code="EnsemblPlants",
                        experiment_accession_code="aegilops_tauschii",
                        experiment_title="It doesn't really matter.",
                        organism_id=37682,
                        organism_name="AEGILOPS TAUSCHII",
                        release_date="2017-11-02",
                        last_uploaded_date="2017-11-02",
                        status=BatchStatuses.DOWNLOADED.value)
    index_batch.save()

    for key, value in (("length", "_short"), ("kmer_size", "23")):
        BatchKeyValue(batch=index_batch, key=key, value=value).save()

    gtf_file = File(
        size_in_bytes=-1,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.gtf.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
            "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        batch=index_batch)
    gtf_file.save()

    fasta_file = File(
        size_in_bytes=-1,
        raw_format="fa.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.fa.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=(
            "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
            "/aegilops_tauschii/dna/Aegilops_tauschii."
            "ASM34733v1.dna.toplevel.fa.gz"),
        batch=index_batch)
    fasta_file.save()

    index_batch.files = [gtf_file, fasta_file]
    return (index_batch, gtf_file, fasta_file)
def run_sra_processor(self):
    """Queue a SALMON processor job with dummy paired-end SRA fixture data."""
    # Create all the dummy data that would have been created
    # before a processor job could have been generated.
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()

    sra_batch = Batch(
        survey_job=survey_job,
        source_type="SRA",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaHiSeq2500",
        experiment_accession_code="PRJEB5018",
        experiment_title="It doesn't really matter.",
        organism_id=10090,
        organism_name="MUS MUSCULUS",
        release_date="2014-03-25",
        last_uploaded_date="2016-05-20",
        status=BatchStatuses.NEW.value,
    )
    sra_batch.save()

    # Paired-end reads: one File per mate.
    for mate in ("1", "2"):
        File(name="ERR1680082_{}.fastq".format(mate),
             download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                           "ERR1680082/ERR1680082_{}.fastq.gz".format(mate)),
             raw_format="fastq",
             processed_format="sf",
             internal_location="IlluminaHiSeq2500/SALMON",
             size_in_bytes=2214725074,
             batch=sra_batch).save()

    job = ProcessorJob.create_job_and_relationships(batches=[sra_batch])

    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[sra_batch.pipeline_required], job.id)
def insert_objects(self) -> List[Batch]:
    """Create and save two transcriptome-index batches ("_short" and "_long").

    Each batch gets "length" and "kmer_size" BatchKeyValues plus a FASTA
    and a GTF File whose names embed the length suffix.

    Returns the two saved batches.
    """
    # Removed an unused local `download_url`; the Files below use
    # self.fasta_download_url and self.gtf_download_url instead.
    batch1 = Batch(survey_job=self.survey_job,
                   source_type="TRANSCRIPTOME_INDEX",
                   pipeline_required="TRANSCRIPTOME_INDEX",
                   platform_accession_code="EnsemblPlants",
                   experiment_accession_code="AEGILOPS_TAUSCHII",
                   experiment_title="It doesn't really matter.",
                   organism_id=37682,
                   organism_name="AEGILOPS TAUSCHII",
                   release_date="2017-05-05",
                   last_uploaded_date="2017-05-05",
                   status=BatchStatuses.NEW.value)
    batch2 = copy.deepcopy(batch1)
    batch1.save()
    batch2.save()

    for batch, length, kmer_size in [(batch1, "_short", "23"),
                                     (batch2, "_long", "31")]:
        BatchKeyValue(batch=batch, key="length", value=length).save()
        BatchKeyValue(batch=batch, key="kmer_size", value=kmer_size).save()

        file1 = File(size_in_bytes=0,
                     download_url=self.fasta_download_url,
                     raw_format="fa.gz",
                     processed_format="tar.gz",
                     name="Aegilops_tauschii{}.fa.gz".format(length),
                     internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                     batch=batch)
        file2 = File(size_in_bytes=0,
                     download_url=self.gtf_download_url,
                     raw_format="gtf.gz",
                     processed_format="tar.gz",
                     name="Aegilops_tauschii{}.gtf.gz".format(length),
                     internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                     batch=batch)
        file1.save()
        file2.save()
        batch.files = [file1, file2]

    return [batch1, batch2]
def init_objects():
    """Create a DOWNLOADED SALMON batch with paired-end FASTQ files.

    Returns ``(batch, first_fastq_file, second_fastq_file)``.
    """
    survey_job = SurveyJob(source_type="SALMON")
    survey_job.save()

    batch = Batch(survey_job=survey_job,
                  source_type="SALMON",
                  pipeline_required="SALMON",
                  platform_accession_code="IlluminaGenomeAnalyzerII",
                  experiment_accession_code="ERX000259",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  # Repaired garbled literal (was "H**O SAPIENS"); 9606 is Homo sapiens.
                  organism_name="HOMO SAPIENS",
                  release_date="2017-11-02",
                  last_uploaded_date="2017-11-02",
                  status=BatchStatuses.DOWNLOADED.value)
    batch.save()

    first_fastq_file = File(
        size_in_bytes=2214725074,
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR003000_1.fastq.gz",
        internal_location="IlluminaGenomeAnalyzerII/SALMON",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                      "ERR003000/ERR003000_1.fastq.gz"),
        batch=batch)
    first_fastq_file.save()

    second_fastq_file = File(
        size_in_bytes=2214725074,
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR003000_2.fastq.gz",
        internal_location="IlluminaGenomeAnalyzerII/SALMON",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                      "ERR003000/ERR003000_2.fastq.gz"),
        batch=batch)
    second_fastq_file.save()

    batch.files = [first_fastq_file, second_fastq_file]
    return (batch, first_fastq_file, second_fastq_file)
def insert_objects(self) -> List[Batch]:
    """Create and save two ARRAY_EXPRESS batches, each with its own CEL file.

    Returns ``([batch, batch2], [file, file2])``.
    NOTE(review): the ``List[Batch]`` annotation does not match the tuple
    actually returned; left unchanged to avoid touching the signature.
    """
    download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
    batch = Batch(survey_job=self.survey_job,
                  source_type="ARRAY_EXPRESS",
                  pipeline_required="AFFY_TO_PCL",
                  platform_accession_code="A-AFFY-1",
                  experiment_accession_code="E-MTAB-3050",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  # Repaired garbled literal (was "H**O SAPIENS"); 9606 is Homo sapiens.
                  organism_name="HOMO SAPIENS",
                  release_date="2017-05-05",
                  last_uploaded_date="2017-05-05",
                  status=BatchStatuses.NEW.value)
    batch2 = copy.deepcopy(batch)
    batch.save()
    batch2.save()

    file = File(size_in_bytes=0,
                download_url=download_url,
                raw_format="CEL",
                processed_format="PCL",
                name="CE1234.CEL",
                internal_location="A-AFFY-1/AFFY_TO_PCL/",
                batch=batch)
    file2 = File(size_in_bytes=0,
                 download_url=download_url,
                 raw_format="CEL",
                 processed_format="PCL",
                 name="CE2345.CEL",
                 internal_location="A-AFFY-1/AFFY_TO_PCL/",
                 batch=batch2)
    file.save()
    file2.save()
    batch.files = [file]
    # BUG FIX: batch2 was previously given [file] (batch1's file); it must
    # reference its own file2, matching the File(batch=batch2) above.
    batch2.files = [file2]

    return ([batch, batch2], [file, file2])
def insert_objects(self) -> List[Batch]:
    """Create and save one SRA batch with paired-end FASTQ files.

    Returns ``(batch, [file, file2])``.
    NOTE(review): the ``List[Batch]`` annotation does not match the tuple
    actually returned; left unchanged to avoid touching the signature.
    """
    # Removed an unused local `download_url` -- the Files below carry
    # their own explicit URLs.
    batch = Batch(survey_job=self.survey_job,
                  source_type="SRA",
                  pipeline_required="SALMON",
                  platform_accession_code="IlluminaHiSeq2000",
                  experiment_accession_code="DRX001563",
                  experiment_title="It doesn't really matter.",
                  organism_id=9031,
                  organism_name="GALLUS GALLUS",
                  release_date="2013-07-19",
                  last_uploaded_date="2017-09-11",
                  status=BatchStatuses.NEW.value)
    batch.save()

    file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_1.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="DRR002116_1.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=batch)
    file2 = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_2.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="DRR002116_2.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=batch)
    file.save()
    file2.save()
    batch.files = [file, file2]

    return (batch, [file, file2])
def add_batch(self,
              platform_accession_code: str,
              experiment_accession_code: str,
              organism_id: int,
              organism_name: str,
              experiment_title: str,
              release_date,
              last_uploaded_date,
              files: List[File],
              key_values: Dict = None):
    """Create and save a Batch along with its Files and key/value properties.

    Skips insertion entirely if any of the given file names already exists,
    to prevent creating duplicate Batches.
    """
    # BUG FIX: the default was a shared mutable dict ({}); use None as the
    # sentinel so each call gets its own empty mapping.
    if key_values is None:
        key_values = {}

    # Prevent creating duplicate Batches.
    for file in files:
        if File.objects.filter(name=file.name).count() != 0:
            logger.info((
                "Skipping sample with name %s because a File already exists with "
                "that name."), file.name)
            return

    batch = Batch(survey_job=self.survey_job,
                  source_type=self.source_type(),
                  status=BatchStatuses.NEW.value,
                  platform_accession_code=platform_accession_code,
                  experiment_accession_code=experiment_accession_code,
                  organism_id=organism_id,
                  organism_name=organism_name,
                  experiment_title=experiment_title,
                  release_date=release_date,
                  last_uploaded_date=last_uploaded_date)
    # The files must be attached before the pipeline can be determined.
    batch.files = files
    batch.pipeline_required = self.determine_pipeline(batch, key_values).value
    batch.save()

    for file in files:
        file.internal_location = os.path.join(
            batch.platform_accession_code, batch.pipeline_required)
        file.batch = batch
        file.save()

    for key, value in key_values.items():
        BatchKeyValue(batch=batch, key=key, value=value).save()

    self.batches.append(batch)