def setUpClass(cls):
    """Insert the survey job, batch, and file shared by this TestCase."""
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-141",
        experiment_accession_code="E-GEOD-59071",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        # NOTE(review): organism_name looks mangled ("H**O" vs "HOMO") —
        # confirm against the rest of the fixtures before changing.
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    batch.save()

    cel_file = File(
        size_in_bytes=0,
        download_url=("ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/"
                      "experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"),
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL",
        batch=batch,
    )
    cel_file.save()

    super(FilesTestCase, cls).setUpClass()
def init_objects():
    """Create and persist a DOWNLOADED Array Express batch with one CEL file.

    Returns the batch with its ``files`` attribute populated.
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.DOWNLOADED.value,
    )
    batch.save()

    cel_file = File(
        size_in_bytes=0,
        download_url=("ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/"
                      "experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"),
        raw_format="CEL",
        processed_format="PCL",
        name="CE1234.CEL",
        internal_location="A-AFFY-1/AFFY_TO_PCL/",
        batch=batch,
    )
    cel_file.save()

    batch.files = [cel_file]
    return batch
def handle(self, *args, **options):
    """Create dummy survey/batch/file records and queue a downloader job.

    All of this data would normally have been created by a surveyor
    before a downloader job could be generated.
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-141",
        experiment_accession_code="E-GEOD-59071",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    batch.save()

    raw_file = File(
        batch=batch,
        size_in_bytes=0,
        download_url=("ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/"
                      "experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"),
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072_CD_colon_active_2.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL",
    )
    raw_file.save()

    downloader_job = DownloaderJob.create_job_and_relationships(batches=[batch])
    send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
def _insert_salmon_index():
    """Insert a PROCESSED transcriptome-index batch for the test organism.

    The batch carries a "kmer_size" key/value and a single index file.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="TEST",
        experiment_accession_code="HOMO_SAPIENS",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.PROCESSED.value,
    )
    batch.save()

    BatchKeyValue(key="kmer_size", value="23", batch=batch).save()

    index_file = File(
        size_in_bytes=2214725074,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="Homo_sapiens_short.gtf.gz",
        internal_location="TEST/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens"
                      "/Homo_sapiens.GRCh38.90.gtf.gz"),
        batch=batch,
    )
    index_file.save()
def test_good_file_grouping(self):
    """_verify_files returns None when both files share a download_url."""
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[], downloader_task="dummy")
    first = File(download_url="a")
    second = File(download_url="a")
    self.assertIsNone(
        transcriptome_index._verify_files(first, second, downloader_job))
def test_bad_file_grouping(self):
    """_verify_files raises ValueError when the download_urls differ."""
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[], downloader_task="dummy")
    # The assertIsNone that previously wrapped this call was dead code:
    # the call is expected to raise, so the assertion never evaluated.
    with self.assertRaises(ValueError):
        transcriptome_index._verify_files(File(download_url="a"),
                                          File(download_url="b"),
                                          downloader_job)
def test_survey(self):
    """Batches whose files all share one download_url are grouped together."""
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    surveyor = ArrayExpressSurveyor(survey_job)

    file1 = File(download_url="a")
    file2 = File(download_url="a")
    file3 = File(download_url="b")
    file4 = File(download_url="a")

    batch1 = Batch(files=[file1])
    batch2 = Batch(files=[file2])
    batch3 = Batch(files=[file3, file4])  # Mixed URLs: grouped separately.
    surveyor.batches = [batch1, batch2, batch3]

    groups = surveyor.group_batches()
    self.assertEqual(groups, [[batch1, batch2], [batch3]])
def get_batch():
    """Create, save, and return a NEW Array Express batch with one CEL file."""
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    batch.save()

    cel_file = File(
        size_in_bytes=0,
        download_url="example.com",
        raw_format="CEL",
        processed_format="PCL",
        name="CE1234.CEL",
        internal_location="A-AFFY-1/AFFY_TO_PCL/",
        batch=batch,
    )
    cel_file.save()

    return batch
def test_aspera_downloader(self):
    """A file hosted on ftp.sra.ebi.ac.uk downloads via the Aspera path.

    The previous docstring was empty. NOTE(review): this test downloads
    from the live EBI server, so it needs network access — confirm it is
    intended to run outside an integration suite.
    """
    batch = Batch(survey_job=self.survey_job,
                  source_type="SRA",
                  pipeline_required="SALMON",
                  platform_accession_code="IlluminaHiSeq2000",
                  experiment_accession_code="DRX001563",
                  experiment_title="It doesn't really matter.",
                  organism_id=9031,
                  organism_name="GALLUS GALLUS",
                  release_date="2013-07-19",
                  last_uploaded_date="2017-09-11",
                  status=BatchStatuses.NEW.value)
    batch.save()

    # This is converted from an FTP URL to an Aspera one by _download_file.
    file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR036000_1.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=batch)

    dj = DownloaderJob()
    self.assertTrue(sra._download_file(file, dj, file.name))
def _download_file(file: File,
                   downloader_job: DownloaderJob,
                   target_file_path: str,
                   force_ftp: bool = False) -> bool:
    """Download-file dispatcher.

    Dispatches SRA-hosted files to the Aspera downloader (unless
    force_ftp is True) and everything else to the FTP downloader.
    Returns whatever the chosen downloader returns.

    NOTE: mutates file.download_url in place when rewriting to Aspera.
    """
    # SRA files have Aspera downloads.
    if 'ftp.sra.ebi.ac.uk' in file.download_url and not force_ftp:
        # From: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz
        # To: [email protected]:/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz
        # Fixed: a single anchored replacement instead of three global
        # str.replace calls, which could also rewrite 'ftp' occurrences
        # later in the path or file name.
        file.download_url = file.download_url.replace(
            'ftp://ftp.sra.ebi.ac.uk/', '[email protected]:/', 1)
        return _download_file_aspera(file, downloader_job, target_file_path)
    else:
        return _download_file_ftp(file, downloader_job, target_file_path)
def run_trasnscriptome_processor(self):
    """Create dummy data and queue a TRANSCRIPTOME_INDEX processor job.

    All of this data would normally exist before a processor job could
    have been generated.

    TODO(review): the method name has a typo ("trasnscriptome"); renaming
    would break callers, so it is kept as-is.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="EnsemblPlants",
        experiment_accession_code="aegilops_tauschii",
        experiment_title="It doesn't really matter.",
        organism_id=37682,
        organism_name="AEGILOPS TAUSCHII",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    batch.save()

    BatchKeyValue(batch=batch, key="kmer_size", value="31").save()

    gtf_file = File(
        name="aegilops_tauschii_short.gtf.gz",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                      "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        raw_format="gtf.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=batch)
    gtf_file.save()

    fasta_file = File(
        name="aegilops_tauschii_short.fa.gz",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                      "/aegilops_tauschii/dna/Aegilops_tauschii."
                      "ASM34733v1.dna.toplevel.fa.gz"),
        raw_format="fa.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=batch)
    fasta_file.save()

    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])
    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
def run_sra_processor(self):
    """Create dummy SRA data and queue a SALMON processor job.

    All of this data would normally exist before a processor job could
    have been generated.
    """
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="SRA",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaHiSeq2500",
        experiment_accession_code="PRJEB5018",
        experiment_title="It doesn't really matter.",
        organism_id=10090,
        organism_name="MUS MUSCULUS",
        release_date="2014-03-25",
        last_uploaded_date="2016-05-20",
        status=BatchStatuses.NEW.value,
    )
    batch.save()

    # Paired-end reads: one File per mate.
    first_read = File(
        name="ERR1680082_1.fastq",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                      "ERR1680082/ERR1680082_1.fastq.gz"),
        raw_format="fastq",
        processed_format="sf",
        internal_location="IlluminaHiSeq2500/SALMON",
        size_in_bytes=2214725074,
        batch=batch)
    first_read.save()

    second_read = File(
        name="ERR1680082_2.fastq",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                      "ERR1680082/ERR1680082_2.fastq.gz"),
        raw_format="fastq",
        processed_format="sf",
        internal_location="IlluminaHiSeq2500/SALMON",
        size_in_bytes=2214725074,
        batch=batch)
    second_read.save()

    processor_job = ProcessorJob.create_job_and_relationships(batches=[batch])
    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
def _generate_batch(self, species: Dict) -> None:
    """Add one Batch per index length ("_long" and "_short") for a species.

    NOTE: mutates the passed-in species dict — pops "division" and adds
    "length"/"kmer_size" — because the remaining entries are stored as
    key/values on the Batch.
    """
    url_builder = ensembl_url_builder_factory(species)
    fasta_download_url = url_builder.build_transcriptome_url()
    gtf_download_url = url_builder.build_gtf_url()
    now = timezone.now()

    platform_accession_code = species.pop("division")
    self._clean_metadata(species)

    for length in ("_long", "_short"):
        base_name = url_builder.file_name_species + length

        fasta_file = File(name=base_name + ".fa.gz",
                          download_url=fasta_download_url,
                          raw_format="fa.gz",
                          processed_format="tar.gz",
                          size_in_bytes=-1)  # Will have to be determined later

        gtf_file = File(name=base_name + ".gtf.gz",
                        download_url=gtf_download_url,
                        raw_format="gtf.gz",
                        processed_format="tar.gz",
                        size_in_bytes=-1)  # Will have to be determined later

        # Add a couple extra key/value pairs to the Batch.
        species["length"] = length
        species["kmer_size"] = "31" if length == "_long" else "23"

        self.add_batch(
            platform_accession_code=platform_accession_code,
            experiment_accession_code=url_builder.file_name_species.upper(),
            organism_id=url_builder.taxonomy_id,
            organism_name=url_builder.scientific_name,
            experiment_title="NA",
            release_date=now,
            last_uploaded_date=now,
            files=[fasta_file, gtf_file],
            # Store the rest of the metadata about these!
            key_values=species)
def init_objects():
    """Create a DOWNLOADED transcriptome-index batch with gtf + fasta files.

    Returns (batch, gtf_file, fasta_file); batch.files is populated.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="EnsemblPlants",
        experiment_accession_code="aegilops_tauschii",
        experiment_title="It doesn't really matter.",
        organism_id=37682,
        organism_name="AEGILOPS TAUSCHII",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    batch.save()

    # Key/values consumed by the transcriptome-index processor.
    BatchKeyValue(batch=batch, key="length", value="_short").save()
    BatchKeyValue(batch=batch, key="kmer_size", value="23").save()

    gtf_file = File(
        size_in_bytes=-1,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.gtf.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                      "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        batch=batch)
    gtf_file.save()

    fasta_file = File(
        size_in_bytes=-1,
        raw_format="fa.gz",
        processed_format="tar.gz",
        name="aegilops_tauschii_short.fa.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                      "/aegilops_tauschii/dna/Aegilops_tauschii."
                      "ASM34733v1.dna.toplevel.fa.gz"),
        batch=batch)
    fasta_file.save()

    batch.files = [gtf_file, fasta_file]
    return (batch, gtf_file, fasta_file)
def _build_file(run_accession: str, read_suffix="") -> File:
    """Build an (unsaved) File for one ENA run.

    ENA has a weird way of nesting data: when the run accession is
    longer than 9 characters an extra sub-directory appears in the
    path, formed from "00" plus the accession's last digit.
    """
    if len(run_accession) > 9:
        sub_dir = ENA_SUB_DIR_PREFIX + run_accession[-1]
    else:
        sub_dir = ""

    download_url = ENA_DOWNLOAD_URL_TEMPLATE.format(
        short_accession=run_accession[:6],
        sub_dir=sub_dir,
        long_accession=run_accession,
        read_suffix=read_suffix)

    return File(name=run_accession + read_suffix + ".fastq.gz",
                download_url=download_url,
                raw_format="fastq.gz",
                processed_format="tar.gz",
                size_in_bytes=-1)  # Will have to be determined later
def init_objects():
    """Create a DOWNLOADED SALMON batch with a pair of fastq files.

    Returns (batch, first_fastq_file, second_fastq_file); batch.files
    is populated.
    """
    survey_job = SurveyJob(source_type="SALMON")
    survey_job.save()

    batch = Batch(
        survey_job=survey_job,
        source_type="SALMON",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaGenomeAnalyzerII",
        experiment_accession_code="ERX000259",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    batch.save()

    first_fastq_file = File(
        size_in_bytes=2214725074,
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR003000_1.fastq.gz",
        internal_location="IlluminaGenomeAnalyzerII/SALMON",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                      "ERR003000/ERR003000_1.fastq.gz"),
        batch=batch)
    first_fastq_file.save()

    second_fastq_file = File(
        size_in_bytes=2214725074,
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR003000_2.fastq.gz",
        internal_location="IlluminaGenomeAnalyzerII/SALMON",
        download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR003/"
                      "ERR003000/ERR003000_2.fastq.gz"),
        batch=batch)
    second_fastq_file.save()

    batch.files = [first_fastq_file, second_fastq_file]
    return (batch, first_fastq_file, second_fastq_file)
def insert_objects(self) -> List[Batch]:
    """Insert two transcriptome-index batches ("_short" and "_long"),
    each with a fasta and a gtf file, and return them.
    """
    # Fixed: removed an unused local `download_url` — the files use
    # self.fasta_download_url / self.gtf_download_url instead.
    batch1 = Batch(survey_job=self.survey_job,
                   source_type="TRANSCRIPTOME_INDEX",
                   pipeline_required="TRANSCRIPTOME_INDEX",
                   platform_accession_code="EnsemblPlants",
                   experiment_accession_code="AEGILOPS_TAUSCHII",
                   experiment_title="It doesn't really matter.",
                   organism_id=37682,
                   organism_name="AEGILOPS TAUSCHII",
                   release_date="2017-05-05",
                   last_uploaded_date="2017-05-05",
                   status=BatchStatuses.NEW.value)
    batch2 = copy.deepcopy(batch1)
    batch1.save()
    batch2.save()

    for batch, length, kmer_size in [(batch1, "_short", "23"),
                                     (batch2, "_long", "31")]:
        BatchKeyValue(batch=batch, key="length", value=length).save()
        BatchKeyValue(batch=batch, key="kmer_size", value=kmer_size).save()

        file1 = File(size_in_bytes=0,
                     download_url=self.fasta_download_url,
                     raw_format="fa.gz",
                     processed_format="tar.gz",
                     name="Aegilops_tauschii{}.fa.gz".format(length),
                     internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                     batch=batch)
        file2 = File(size_in_bytes=0,
                     download_url=self.gtf_download_url,
                     raw_format="gtf.gz",
                     processed_format="tar.gz",
                     name="Aegilops_tauschii{}.gtf.gz".format(length),
                     internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
                     batch=batch)
        file1.save()
        file2.save()
        batch.files = [file1, file2]

    return [batch1, batch2]
def insert_objects(self):
    """Insert one SRA batch with two paired-end fastq files.

    Returns a tuple of (batch, [file, file2]).
    """
    # Fixed: removed an unused local `download_url`, and dropped the
    # incorrect `-> List[Batch]` annotation (a tuple is returned).
    batch = Batch(survey_job=self.survey_job,
                  source_type="SRA",
                  pipeline_required="SALMON",
                  platform_accession_code="IlluminaHiSeq2000",
                  experiment_accession_code="DRX001563",
                  experiment_title="It doesn't really matter.",
                  organism_id=9031,
                  organism_name="GALLUS GALLUS",
                  release_date="2013-07-19",
                  last_uploaded_date="2017-09-11",
                  status=BatchStatuses.NEW.value)
    batch.save()

    file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_1.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="DRR002116_1.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=batch)
    file2 = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116_2.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="DRR002116_2.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=batch)
    file.save()
    file2.save()

    batch.files = [file, file2]
    return (batch, [file, file2])
def insert_objects(self):
    """Insert two ARRAY_EXPRESS batches, each with one CEL file.

    Returns a tuple of ([batch, batch2], [file, file2]).
    """
    download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
    batch = Batch(survey_job=self.survey_job,
                  source_type="ARRAY_EXPRESS",
                  pipeline_required="AFFY_TO_PCL",
                  platform_accession_code="A-AFFY-1",
                  experiment_accession_code="E-MTAB-3050",
                  experiment_title="It doesn't really matter.",
                  organism_id=9606,
                  organism_name="H**O SAPIENS",
                  release_date="2017-05-05",
                  last_uploaded_date="2017-05-05",
                  status=BatchStatuses.NEW.value)
    batch2 = copy.deepcopy(batch)
    batch.save()
    batch2.save()

    file = File(size_in_bytes=0,
                download_url=download_url,
                raw_format="CEL",
                processed_format="PCL",
                name="CE1234.CEL",
                internal_location="A-AFFY-1/AFFY_TO_PCL/",
                batch=batch)
    file2 = File(size_in_bytes=0,
                 download_url=download_url,
                 raw_format="CEL",
                 processed_format="PCL",
                 name="CE2345.CEL",
                 internal_location="A-AFFY-1/AFFY_TO_PCL/",
                 batch=batch2)
    file.save()
    file2.save()

    batch.files = [file]
    # Fixed: was `batch2.files = [file]`, but `file` belongs to `batch`;
    # file2 was created with batch=batch2. Also dropped the incorrect
    # `-> List[Batch]` annotation (a tuple is returned).
    batch2.files = [file2]

    return ([batch, batch2], [file, file2])
def _generate_batches(self,
                      samples: List[Dict],
                      experiment: Dict,
                      replicate_raw: bool = True) -> List[Batch]:
    """Generate a Batch for each sample in samples.

    Metadata contained in experiment (which should be generated via
    get_experiment_metadata) is attached to every Batch. When
    replicate_raw is True (the default) only raw files are replicated;
    otherwise all files are replicated.
    """
    for sample in samples:
        if "file" not in sample:
            continue

        # Determine the organism from the sample's characteristics.
        organism_name = "UNKNOWN"
        for characteristic in sample["characteristic"]:
            if characteristic["category"].upper() == "ORGANISM":
                organism_name = characteristic["value"].upper()

        if organism_name == "UNKNOWN":
            logger.error(
                "Sample from experiment %s did not specify the organism name.",
                experiment["experiment_accession_code"],
                survey_job=self.survey_job.id)
            organism_id = 0
        else:
            organism_id = Organism.get_id_for_name(organism_name)

        for sample_file in sample["file"]:
            # Generally we only want to replicate the raw data if we
            # can, however if there isn't raw data then we'll take the
            # processed stuff.
            if (replicate_raw and sample_file["type"] != "data") \
                    or sample_file["name"] is None:
                continue

            # sample_file["comment"] is only a list if there's more
            # than one comment...
            comments = sample_file["comment"]
            if isinstance(comments, list):
                # Could be: "Derived ArrayExpress Data Matrix FTP file"
                # or: "ArrayExpress FTP file". If there is no comment
                # with a name including "FTP file" then we don't know
                # where to download it so we need to mark this job as
                # an error. Therefore don't catch the potential
                # exception where download_url doesn't get defined.
                for comment in comments:
                    if comment["name"].find("FTP file") != -1:
                        download_url = comment["value"]
            else:
                download_url = comments["value"]
            # NOTE(review): when no "FTP file" comment matches,
            # download_url silently keeps the previous file's value on
            # later iterations (only the first miss raises) — confirm
            # this carryover is intended.

            raw_format = sample_file["name"].split(".")[-1]
            processed_format = "PCL" if replicate_raw else raw_format

            file = File(name=sample_file["name"],
                        download_url=download_url,
                        raw_format=raw_format,
                        processed_format=processed_format,
                        size_in_bytes=-1)  # Will have to be determined later

            self.add_batch(
                platform_accession_code=experiment["platform_accession_code"],
                experiment_accession_code=experiment["experiment_accession_code"],
                organism_id=organism_id,
                organism_name=organism_name,
                experiment_title=experiment["name"],
                release_date=experiment["release_date"],
                last_uploaded_date=experiment["last_update_date"],
                files=[file])