def _end_job(survey_job: SurveyJob, success=True) -> SurveyJob:
    """Finish a survey job: record its outcome and completion time.

    The job is persisted and returned so callers can chain on it.
    """
    survey_job.end_time = timezone.now()
    survey_job.success = success
    survey_job.save()
    return survey_job
def test_single_plant(self):
    """Tests that the files returned actually exist.

    Uses the EnsemblPlants division rather than the main division.
    (The previous docstring incorrectly said Metazoa.)
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="ensembl_division",
                                       value="EnsemblPlants")
    key_value_pair.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="organism_name",
                                       value="Arabidopsis thaliana")
    key_value_pair.save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    files = surveyor.discover_species()[0]

    # Each discovered URL must be reachable on the Ensembl FTP site.
    for file in files:
        urllib.request.urlopen(file.source_url)

    # Make sure the organism object got created by making sure
    # this doesn't raise an exception.
    Organism.objects.get(name="ARABIDOPSIS_THALIANA")
def _insert_salmon_index():
    """Creates a batch for the index for the organism for the test."""
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()

    index_batch = Batch(
        survey_job=job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="TEST",
        experiment_accession_code="HOMO_SAPIENS",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.PROCESSED.value,
    )
    index_batch.save()

    # The processor reads the kmer size off the batch's key/values.
    BatchKeyValue(key="kmer_size", value="23", batch=index_batch).save()

    gtf_file = File(
        size_in_bytes=2214725074,
        raw_format="gtf.gz",
        processed_format="tar.gz",
        name="Homo_sapiens_short.gtf.gz",
        internal_location="TEST/TRANSCRIPTOME_INDEX",
        download_url=("ftp://ftp.ensembl.org/pub/release-90/gtf/homo_sapiens"
                      "/Homo_sapiens.GRCh38.90.gtf.gz"),
        batch=index_batch,
    )
    gtf_file.save()
def test_survey_bacteria(self, mock_send_job):
    """Surveying a bacterial organism queues exactly one downloader job
    and records the organism with its taxonomy id."""
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()
    kvp = SurveyJobKeyValue(survey_job=job,
                            key="ensembl_division",
                            value="EnsemblBacteria")
    kvp.save()
    kvp = SurveyJobKeyValue(survey_job=job,
                            key="organism_name",
                            value="PSEUDOMONAS_AERUGINOSA")
    kvp.save()

    surveyor = TranscriptomeIndexSurveyor(job)
    surveyor.survey(source_type="TRANSCRIPTOME_INDEX")

    downloader_jobs = DownloaderJob.objects.order_by("id").all()
    self.assertEqual(downloader_jobs.count(), 1)

    # Every queued downloader job must have been dispatched via send_job.
    send_job_calls = [call(Downloaders.TRANSCRIPTOME_INDEX, downloader_job)
                      for downloader_job in downloader_jobs]
    mock_send_job.assert_has_calls(send_job_calls)

    # Make sure the organism object got created with the correct
    # taxonomy id by making sure this doesn't raise an exception.
    Organism.objects.get(name="PSEUDOMONAS_AERUGINOSA", taxonomy_id=287)
def get_batch():
    """Builds and persists a NEW Array Express batch with one CEL file."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    new_batch = Batch(
        survey_job=job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    new_batch.save()

    cel_file = File(
        size_in_bytes=0,
        download_url="example.com",
        raw_format="CEL",
        processed_format="PCL",
        name="CE1234.CEL",
        internal_location="A-AFFY-1/AFFY_TO_PCL/",
        batch=new_batch,
    )
    cel_file.save()

    return new_batch
def test_calls_survey(self, mock_get):
    """If source_type is supported calls the appropriate survey method.

    End-to-end: runs a survey job, then waits for the downloader and
    processor jobs it spawns and asserts each succeeded.
    """
    mock_get.side_effect = mocked_requests_get

    # Prevent a call being made to NCBI's API to determine
    # organism name/id.
    organism = Organism(name="H**O SAPIENS", taxonomy_id=9606,
                        is_scientific_name=True)
    organism.save()

    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value="E-GEOD-22166")
    key_value_pair.save()

    surveyor.run_job(survey_job)
    logger.info("Started Survey Job %d, waiting for it to complete.",
                survey_job.id)
    survey_job = wait_for_job(survey_job, SurveyJob)
    self.assertTrue(survey_job.success)

    # NOTE: a redundant `batch = Batch.objects.all()[0]` used to precede
    # this query; it was dead code (immediately overwritten) and was removed.
    batch = Batch.objects.filter(survey_job=survey_job).get()
    downloader_job = batch.downloaderjob_set.get()
    logger.info("Survey Job finished, waiting for Downloader Job %d to complete.",
                downloader_job.id)
    downloader_job = wait_for_job(downloader_job, DownloaderJob)
    self.assertTrue(downloader_job.success)

    processor_job = batch.processorjob_set.get()
    logger.info("Downloader Job finished, waiting for processor Job %d to complete.",
                processor_job.id)
    processor_job = wait_for_job(processor_job, ProcessorJob)
    self.assertTrue(processor_job.success)
def handle(self, *args, **options):
    """Seed the dummy survey/batch/file records a downloader job needs,
    then queue one Array Express downloader job."""
    # Create all the dummy data that would have been created
    # before a downloader job could have been generated.
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    new_batch = Batch(
        survey_job=job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-141",
        experiment_accession_code="E-GEOD-59071",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    new_batch.save()

    cel_file = File(
        batch=new_batch,
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072_CD_colon_active_2.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL",
    )
    cel_file.save()

    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=[new_batch])
    send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
def test_correct_index_location(self):
    """Tests that the files returned actually exist.

    Uses an organism in the main division.
    """
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()
    kvp = SurveyJobKeyValue(survey_job=job,
                            key="ensembl_division",
                            value="Ensembl")
    kvp.save()
    kvp = SurveyJobKeyValue(survey_job=job,
                            key="organism_name",
                            value="Danio rerio")
    kvp.save()

    surveyor = TranscriptomeIndexSurveyor(job)
    files = surveyor.discover_species()[0]

    # Make sure the organism object got created by making sure
    # this doesn't raise an exception.
    Organism.objects.get(name="DANIO_RERIO")

    # Every discovered URL must be reachable.
    for discovered_file in files:
        urllib.request.urlopen(discovered_file.source_url)
def init_objects():
    """Creates a DOWNLOADED Array Express batch with one CEL file and
    returns it with `files` populated."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    new_batch = Batch(
        survey_job=job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-1",
        experiment_accession_code="E-MTAB-3050",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.DOWNLOADED.value,
    )
    new_batch.save()

    raw_file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="CE1234.CEL",
        internal_location="A-AFFY-1/AFFY_TO_PCL/",
        batch=new_batch,
    )
    raw_file.save()
    new_batch.files = [raw_file]

    return new_batch
def test_correct_index_location_metazoa(self):
    """Tests that the files returned actually exist.

    Tests the Metazoa division instead of the main division.
    """
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()
    kvp = SurveyJobKeyValue(survey_job=job,
                            key="ensembl_division",
                            value="EnsemblMetazoa")
    kvp.save()
    kvp = SurveyJobKeyValue(survey_job=job,
                            key="organism_name",
                            value="Octopus bimaculoides")
    kvp.save()

    surveyor = TranscriptomeIndexSurveyor(job)
    files = surveyor.discover_species()[0]

    # Every discovered URL must be reachable.
    for discovered_file in files:
        urllib.request.urlopen(discovered_file.source_url)

    # Make sure the organism object got created by making sure
    # this doesn't raise an exception.
    Organism.objects.get(name="OCTOPUS_BIMACULOIDES")
def test_survey_unmated_reads(self, mock_send_job):
    """Test an experiment with unmated reads.

    Also make sure the file report endpoint's properties are recorded.
    """
    job = SurveyJob(source_type="SRA")
    job.save()
    kvp = SurveyJobKeyValue(
        survey_job=job,
        key="experiment_accession_code",
        value="SRP048683",
    )
    kvp.save()

    sra_surveyor = SraSurveyor(job)
    experiment, samples = sra_surveyor.discover_experiment_and_samples()

    self.assertEqual(experiment.accession_code, "SRP048683")
    self.assertEqual(len(samples), 12)

    expected_file_names = set()
    # Just check one file for one sample's expected file size/md5
    for sample in samples:
        if sample.accession_code != "SRR1603661":
            continue
        for original_file in sample.original_files.all():
            expected_file_names.add(original_file.source_filename)
            if original_file.source_filename == "SRR1603661_1.fastq.gz":
                self.assertEqual(
                    original_file.expected_md5, "502a9a482bfa5aa75865ccc0105ad13c"
                )
                self.assertEqual(original_file.expected_size_in_bytes, 6751980628)
    self.assertEqual({"SRR1603661_1.fastq.gz", "SRR1603661_2.fastq.gz"},
                     expected_file_names)
def setUpClass(cls):
    """Seeds one NEW Array Express batch with a single raw CEL file
    before any test in this case runs."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    new_batch = Batch(
        survey_job=job,
        source_type="ARRAY_EXPRESS",
        pipeline_required="AFFY_TO_PCL",
        platform_accession_code="A-AFFY-141",
        experiment_accession_code="E-GEOD-59071",
        experiment_title="It doesn't really matter.",
        organism_id=9606,
        organism_name="H**O SAPIENS",
        release_date="2017-05-05",
        last_uploaded_date="2017-05-05",
        status=BatchStatuses.NEW.value,
    )
    new_batch.save()

    raw_file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
        raw_format="CEL",
        processed_format="PCL",
        name="GSM1426072.CEL",
        internal_location="A-AFFY-141/AFFY_TO_PCL",
        batch=new_batch,
    )
    raw_file.save()

    super(FilesTestCase, cls).setUpClass()
def setUp(self):
    """Creates the SRA survey job under test and pre-populates the
    organisms it needs."""
    job = SurveyJob(source_type="SRA")
    job.save()
    self.survey_job = job

    kvp = SurveyJobKeyValue(survey_job=job,
                            key="experiment_accession_code",
                            value="DRR002116")
    kvp.save()

    # Insert the organism into the database so the model doesn't call the
    # taxonomy API to populate it.
    for name, taxonomy_id in (("HOMO_SAPIENS", 9606),
                              ("GALLUS_GALLUS", 9031),
                              ("DANIO_RERIO", 7955)):
        Organism(name=name,
                 taxonomy_id=taxonomy_id,
                 is_scientific_name=True).save()
def survey_ae_experiment(experiment_accession):
    """Creates and runs an Array Express survey job for one accession."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()
    SurveyJobKeyValue(survey_job=job,
                      key="experiment_accession_code",
                      value=experiment_accession).save()
    run_job(job)
def prep_test(self, experiment_accession):
    """Creates a GEO survey job for the given accession and stashes it
    on the test case."""
    job = SurveyJob(source_type="GEO")
    job.save()
    self.survey_job = job
    SurveyJobKeyValue(survey_job=job,
                      key="experiment_accession_code",
                      value=experiment_accession).save()
def test():
    """Creates and runs an Array Express survey job for E-MTAB-3050.

    (A redundant bare `return` at the end was removed.)
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value="E-MTAB-3050")
    key_value_pair.save()
    run_job(survey_job)
def setUp(self):
    """Creates a transcriptome-index survey job targeting the
    EnsemblPlants division and stashes it on the test case."""
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()
    self.survey_job = job
    SurveyJobKeyValue(survey_job=job,
                      key="ensembl_division",
                      value="EnsemblPlants").save()
def test_run_unsupported_source(self):
    """If source_type is unsupported the job still is started and ended."""
    unsupported_job = SurveyJob(source_type="UNSUPPORTED")
    unsupported_job.save()

    surveyor.run_job(unsupported_job)

    # The runner must stamp both timestamps and mark the job failed.
    self.assertIsInstance(unsupported_job.start_time, datetime.datetime)
    self.assertIsInstance(unsupported_job.end_time, datetime.datetime)
    self.assertFalse(unsupported_job.success)
def create_job_for_accession(self, accession_code: str):
    """Builds and returns an Array Express survey job keyed to the
    given experiment accession."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()
    SurveyJobKeyValue(survey_job=job,
                      key="experiment_accession_code",
                      value=accession_code).save()
    return job
def run_trasnscriptome_processor(self):
    """Seeds a DOWNLOADED transcriptome-index batch (GTF + FASTA files)
    and queues the matching processor job.

    NOTE(review): the method name misspells "transcriptome"; left as-is
    because renaming would break callers.
    """
    # Create all the dummy data that would have been created
    # before a processor job could have been generated.
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()

    index_batch = Batch(
        survey_job=job,
        source_type="TRANSCRIPTOME_INDEX",
        pipeline_required="TRANSCRIPTOME_INDEX",
        platform_accession_code="EnsemblPlants",
        experiment_accession_code="aegilops_tauschii",
        experiment_title="It doesn't really matter.",
        organism_id=37682,
        organism_name="AEGILOPS TAUSCHII",
        release_date="2017-11-02",
        last_uploaded_date="2017-11-02",
        status=BatchStatuses.DOWNLOADED.value,
    )
    index_batch.save()

    # The processor reads the kmer size off the batch's key/values.
    BatchKeyValue(batch=index_batch, key="kmer_size", value="31").save()

    annotation_file = File(
        name="aegilops_tauschii_short.gtf.gz",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                      "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
        raw_format="gtf.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=index_batch,
    )
    annotation_file.save()

    sequence_file = File(
        name="aegilops_tauschii_short.fa.gz",
        download_url=("ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                      "/aegilops_tauschii/dna/Aegilops_tauschii."
                      "ASM34733v1.dna.toplevel.fa.gz"),
        raw_format="fa.gz",
        processed_format="tar.gz",
        internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
        size_in_bytes=-1,
        batch=index_batch,
    )
    sequence_file.save()

    processor_job = ProcessorJob.create_job_and_relationships(
        batches=[index_batch])
    logger.info("Queuing a processor job.")
    send_job(ProcessorPipeline[index_batch.pipeline_required], processor_job.id)
def requeue_survey_job(last_job: SurveyJob) -> bool:
    """Queues a new survey job.

    The new survey job will have num_retries one greater than
    last_job.num_retries. Returns True unconditionally (the annotation
    previously said None, which contradicted the `return True` below).
    """
    num_retries = last_job.num_retries + 1

    new_job = SurveyJob(num_retries=num_retries,
                        source_type=last_job.source_type)

    if new_job.num_retries == 1:
        new_job.ram_amount = 4096
    elif new_job.num_retries in [2, 3]:
        new_job.ram_amount = 16384
    else:
        # NOTE(review): retries >= 4 get *less* RAM (1024) than earlier
        # retries — this looks suspicious; confirm it is intentional.
        new_job.ram_amount = 1024

    new_job.save()

    # Carry every key/value from the failed job over to the retry.
    keyvalues = SurveyJobKeyValue.objects.filter(survey_job=last_job)
    for keyvalue in keyvalues:
        SurveyJobKeyValue.objects.get_or_create(
            survey_job=new_job,
            key=keyvalue.key,
            value=keyvalue.value,
        )

    logger.debug(
        "Requeuing SurveyJob which had ID %d with a new SurveyJob with ID %d.",
        last_job.id,
        new_job.id,
    )

    try:
        if send_job(SurveyJobTypes.SURVEYOR, job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        # Typo fix: message previously read "Surevey Job".
        logger.error(
            "Failed to requeue Survey Job which had ID %d with a new Survey Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        # Can't communicate with AWS just now, leave the job for a later loop.
        new_job.delete()

    return True
def setUp(self):
    """Creates a transcriptome-index survey job and records the GTF and
    FASTA download URLs used by the tests."""
    job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    job.save()
    self.survey_job = job

    self.gtf_download_url = (
        "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf/"
        "aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz")
    self.fasta_download_url = (
        "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta/"
        "aegilops_tauschii/dna/Aegilops_tauschii.ASM34733v1.dna.toplevel.fa.gz"
    )
def test_calls_survey(self, survey_method):
    """If source_type is supported calls the appropriate survey method."""
    survey_method.return_value = True

    supported_job = SurveyJob(source_type="ARRAY_EXPRESS")
    supported_job.save()

    surveyor.run_job(supported_job)

    # The survey method ran exactly once and the job was fully bookended.
    self.assertEqual(len(survey_method.mock_calls), 1)
    self.assertIsInstance(supported_job.start_time, datetime.datetime)
    self.assertIsInstance(supported_job.end_time, datetime.datetime)
    self.assertTrue(supported_job.success)
def survey_sra_experiments(start_accession, end_accession):
    """Creates and runs an SRA survey job over an accession range."""
    job = SurveyJob(source_type="SRA")
    job.save()
    # The surveyor reads the range bounds from the job's key/values.
    for key, value in (("start_accession", start_accession),
                       ("end_accession", end_accession)):
        SurveyJobKeyValue(survey_job=job, key=key, value=value).save()
    run_job(job)
def _start_job(survey_job: SurveyJob) -> SurveyJob:
    """Start survey job, setting time properties.

    Also publishes the job as the module-level CURRENT_JOB so other
    code can see which job is in flight.
    """
    logger.debug("Starting Survey Job for source type: %s.",
                 survey_job.source_type,
                 survey_job=survey_job.id)

    survey_job.start_time = timezone.now()
    survey_job.save()

    global CURRENT_JOB
    CURRENT_JOB = survey_job

    return survey_job
def _start_job(survey_job: SurveyJob):
    """Stamp a survey job's start and replication-window times, then save."""
    logger.info("Starting Survey Job for source type: %s.",
                survey_job.source_type,
                survey_job=survey_job.id)

    survey_job.start_time = timezone.now()
    survey_job.replication_started_at = timezone.now()

    # If the end of the replication range is not already set,
    # set it to the current time.
    if survey_job.replication_ended_at is None:
        survey_job.replication_ended_at = timezone.now()

    survey_job.save()
def test_jobs_sanity(self):
    """Just makes sure creating Jobs doesn't fail"""
    survey = SurveyJob()
    survey.save()

    proc = ProcessorJob()
    proc.pipeline_applied = "test0"
    proc.save()

    download = DownloaderJob()
    download.downloader_task = "XYZ"
    download.accession_code = "123"
    download.save()
def test_nonexistant_srp_survey(self):
    """Try surveying an accession that does not exist """
    job = SurveyJob(source_type="SRA")
    job.save()
    kvp = SurveyJobKeyValue(
        survey_job=job,
        key="experiment_accession_code",
        value="ERP006216",
    )
    kvp.save()

    run_job(job)

    # The job must fail with a clear reason, not raise.
    job.refresh_from_db()
    self.assertFalse(job.success)
    self.assertEqual(job.failure_reason, "No experiment found.")
def create_survey_job(self):
    """Builds and returns an SRA survey job with a dummy accession
    key/value attached."""
    job = SurveyJob(source_type="SRA",
                    nomad_job_id="SURVEYOR/dispatch-1528945054-e8eaf540",
                    num_retries=0,
                    success=None)
    job.save()

    SurveyJobKeyValue(survey_job=job,
                      key="experiment_accession_code",
                      value="RJ-1234-XYZ").save()

    return job
def test_batch_created(self, mock_get):
    """Discovering a one-accession range yields exactly one batch whose
    fields and single file match the mocked SRA metadata."""
    mock_get.side_effect = mocked_requests_get

    # Use same run accession for the start and end of the range to
    # achieve a length of 1
    job = SurveyJob(source_type="SRA")
    job.save()
    for key in ("start_accession", "end_accession"):
        SurveyJobKeyValue(survey_job=job, key=key, value=RUN_ACCESSION).save()

    sra_surveyor = SraSurveyor(job)
    self.assertTrue(sra_surveyor.discover_batches())

    # With only a single run accession there should only be a
    # single batch.
    self.assertEqual(len(sra_surveyor.batches), 1)

    new_batch = sra_surveyor.batches[0]
    self.assertEqual(new_batch.survey_job.id, job.id)
    self.assertEqual(new_batch.source_type, "SRA")
    self.assertEqual(new_batch.pipeline_required, "SALMON")
    self.assertEqual(new_batch.platform_accession_code, "IlluminaHiSeq2000")
    self.assertEqual(new_batch.experiment_accession_code, "DRX001563")
    self.assertEqual(new_batch.experiment_title,
                     ("Illumina HiSeq 2000 sequencing; "
                      "Exp_Gg_HH16_1_embryo_mRNAseq"))
    self.assertEqual(new_batch.status, "NEW")
    self.assertEqual(new_batch.release_date, "2013-07-19")
    self.assertEqual(new_batch.last_uploaded_date, "2017-08-11")
    self.assertEqual(new_batch.organism_id, 9031)
    self.assertEqual(new_batch.organism_name, "GALLUS GALLUS")

    batch_file = new_batch.files[0]
    self.assertEqual(batch_file.size_in_bytes, -1)
    self.assertEqual(
        batch_file.download_url,
        "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116.fastq.gz"
    )  # noqa
    self.assertEqual(batch_file.raw_format, "fastq.gz")
    self.assertEqual(batch_file.processed_format, "tar.gz")
    self.assertEqual(batch_file.name, "DRR002116.fastq.gz")
    self.assertEqual(batch_file.internal_location, "IlluminaHiSeq2000/SALMON")