def setUp(self):
    """Create the SRA survey job, its accession key/value, and stub organisms."""
    self.survey_job = SurveyJob(source_type="SRA")
    self.survey_job.save()

    SurveyJobKeyValue(survey_job=self.survey_job,
                      key="experiment_accession_code",
                      value="DRR002116").save()

    # Insert the organisms into the database so the model doesn't call the
    # taxonomy API to populate them.
    for organism_name, organism_taxonomy_id in (("HOMO_SAPIENS", 9606),
                                                ("GALLUS_GALLUS", 9031),
                                                ("DANIO_RERIO", 7955)):
        Organism(name=organism_name,
                 taxonomy_id=organism_taxonomy_id,
                 is_scientific_name=True).save()
def test_survey_unmated_reads(self, mock_send_job):
    """Test an experiment with unmated reads.

    Also make sure the file report endpoint's properties are recorded.
    """
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="experiment_accession_code", value="SRP048683"
    ).save()

    sra_surveyor = SraSurveyor(survey_job)
    experiment, samples = sra_surveyor.discover_experiment_and_samples()

    self.assertEqual(experiment.accession_code, "SRP048683")
    self.assertEqual(len(samples), 12)

    # Just check one file for one sample's expected file size/md5
    expected_file_names = set()
    target_samples = (s for s in samples if s.accession_code == "SRR1603661")
    for sample in target_samples:
        for original_file in sample.original_files.all():
            expected_file_names.add(original_file.source_filename)
            if original_file.source_filename == "SRR1603661_1.fastq.gz":
                self.assertEqual(
                    original_file.expected_md5, "502a9a482bfa5aa75865ccc0105ad13c"
                )
                self.assertEqual(original_file.expected_size_in_bytes, 6751980628)

    self.assertEqual({"SRR1603661_1.fastq.gz", "SRR1603661_2.fastq.gz"},
                     expected_file_names)
def test_calls_survey(self, mock_get):
    """If source_type is supported calls the appropriate survey method."""
    mock_get.side_effect = mocked_requests_get

    # Prevent a call being made to NCBI's API to determine
    # organism name/id.
    organism = Organism(name="H**O SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    organism.save()

    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value="E-GEOD-22166")
    key_value_pair.save()

    surveyor.run_job(survey_job)
    logger.info("Started Survey Job %d, waiting for it to complete.", survey_job.id)
    survey_job = wait_for_job(survey_job, SurveyJob)
    self.assertTrue(survey_job.success)

    # Removed a dead `batch = Batch.objects.all()[0]` lookup that was
    # immediately overwritten by this filtered query.
    batch = Batch.objects.filter(survey_job=survey_job).get()

    downloader_job = batch.downloaderjob_set.get()
    logger.info("Survey Job finished, waiting for Downloader Job %d to complete.",
                downloader_job.id)
    downloader_job = wait_for_job(downloader_job, DownloaderJob)
    self.assertTrue(downloader_job.success)

    processor_job = batch.processorjob_set.get()
    logger.info("Downloader Job finished, waiting for processor Job %d to complete.",
                processor_job.id)
    processor_job = wait_for_job(processor_job, ProcessorJob)
    self.assertTrue(processor_job.success)
def survey_ae_experiment(experiment_accession):
    """Survey a single ArrayExpress experiment.

    Returns the created SurveyJob so callers can inspect its outcome,
    for consistency with survey_experiment and survey_transcriptome_index.
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=experiment_accession)
    key_value_pair.save()
    run_job(survey_job)
    return survey_job
def setUp(self):
    """Create a transcriptome index survey job for the EnsemblPlants division."""
    self.survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    self.survey_job.save()

    SurveyJobKeyValue(survey_job=self.survey_job,
                      key="ensembl_division",
                      value="EnsemblPlants").save()
def test():
    """Survey the E-MTAB-3050 ArrayExpress experiment.

    The redundant bare trailing `return` was removed; a function returns
    None implicitly when it falls off the end.
    """
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value="E-MTAB-3050")
    key_value_pair.save()
    run_job(survey_job)
def prep_test(self, experiment_accession):
    """Store a GEO survey job for `experiment_accession` on the test case."""
    self.survey_job = SurveyJob(source_type="GEO")
    self.survey_job.save()

    SurveyJobKeyValue(survey_job=self.survey_job,
                      key="experiment_accession_code",
                      value=experiment_accession).save()
def queue_surveyor_for_accession(accession: str) -> SurveyJob:
    """Create (but do not dispatch) a surveyor job for the accession code.

    Returns the created SurveyJob so callers can track it; the previous
    docstring claimed the job was dispatched, but it is only created here.
    """
    # Start at 256MB of RAM for surveyor jobs.
    survey_job = SurveyJob(ram_amount=256)
    set_source_type_for_accession(survey_job, accession)
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=accession)
    key_value_pair.save()
    return survey_job
def create_job_for_accession(self, accession_code: str):
    """Create and return a saved ArrayExpress survey job for `accession_code`."""
    job = SurveyJob(source_type="ARRAY_EXPRESS")
    job.save()

    SurveyJobKeyValue(survey_job=job,
                      key="experiment_accession_code",
                      value=accession_code).save()

    return job
def test_survey_bacteria(self, mock_send_job):
    """Surveying EnsemblBacteria with an organism queues one downloader job."""
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    for key, value in (("ensembl_division", "EnsemblBacteria"),
                       ("organism_name", "PSEUDOMONAS_AERUGINOSA")):
        SurveyJobKeyValue(survey_job=survey_job, key=key, value=value).save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    surveyor.survey(source_type="TRANSCRIPTOME_INDEX")

    downloader_jobs = DownloaderJob.objects.order_by("id").all()
    self.assertEqual(downloader_jobs.count(), 1)
    expected_calls = [call(Downloaders.TRANSCRIPTOME_INDEX, downloader_job)
                      for downloader_job in downloader_jobs]
    mock_send_job.assert_has_calls(expected_calls)

    # Make sure the organism object got created with the correct
    # taxonomy id by making sure this doesn't raise an exception.
    Organism.objects.get(name="PSEUDOMONAS_AERUGINOSA", taxonomy_id=287)
def test_single_plant(self):
    """Tests that the files returned actually exist.

    Surveys a single organism (Arabidopsis thaliana) in the EnsemblPlants
    division. (The docstring previously mis-stated this as Metazoa.)
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="ensembl_division",
                                       value="EnsemblPlants")
    key_value_pair.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="organism_name",
                                       value="Arabidopsis thaliana")
    key_value_pair.save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    files = surveyor.discover_species()[0]
    for file in files:
        urllib.request.urlopen(file.source_url)

    # Make sure the organism object got created by making sure
    # this doesn't raise an exception.
    Organism.objects.get(name="ARABIDOPSIS_THALIANA")
def test_correct_index_location_metazoa(self):
    """Tests that the files returned actually exist.

    Tests the Metazoa division instead of the main division.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    for key, value in (("ensembl_division", "EnsemblMetazoa"),
                       ("organism_name", "Octopus bimaculoides")):
        SurveyJobKeyValue(survey_job=survey_job, key=key, value=value).save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    files = surveyor.discover_species()[0]
    for discovered_file in files:
        urllib.request.urlopen(discovered_file.source_url)

    # Make sure the organism object got created by making sure
    # this doesn't raise an exception.
    Organism.objects.get(name="OCTOPUS_BIMACULOIDES")
def queue_surveyor_for_accession(accession: str) -> SurveyJob:
    """Create a surveyor job for the accession code and return it.

    The return annotation previously said `None` even though the job is
    returned; it now matches the actual behavior.
    """
    # Start at 1GB of RAM for surveyor jobs.
    survey_job = SurveyJob(ram_amount=1024)
    set_source_type_for_accession(survey_job, accession)
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=accession)
    key_value_pair.save()

    # We don't actually send the job here, we just create it.
    # The foreman will pick it up and dispatch it when the time is appropriate.
    return survey_job
def test_correct_index_location(self):
    """Tests that the files returned actually exist.

    Uses an organism in the main division.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    for key, value in (("ensembl_division", "Ensembl"),
                       ("organism_name", "Danio rerio")):
        SurveyJobKeyValue(survey_job=survey_job, key=key, value=value).save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    files = surveyor.discover_species()[0]

    # Make sure the organism object got created by making sure
    # this doesn't raise an exception.
    Organism.objects.get(name="DANIO_RERIO")

    for discovered_file in files:
        urllib.request.urlopen(discovered_file.source_url)
def test_nonexistant_srp_survey(self):
    """Surveying an accession that does not exist fails with a clear reason."""
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="experiment_accession_code", value="ERP006216"
    ).save()

    run_job(survey_job)

    survey_job.refresh_from_db()
    self.assertFalse(survey_job.success)
    self.assertEqual(survey_job.failure_reason, "No experiment found.")
def setUp(self):
    """Create an ArrayExpress survey job and pre-populate its organism."""
    self.survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    self.survey_job.save()

    SurveyJobKeyValue(survey_job=self.survey_job,
                      key="experiment_accession_code",
                      value="E-MTAB-3050").save()

    # Insert the organism into the database so the model doesn't call the
    # taxonomy API to populate it.
    Organism(name="H**O SAPIENS", taxonomy_id=9606, is_scientific_name=True).save()
def test_arrayexpress_alternate_accession(self):
    """Make sure ENA experiments detect their ArrayExpress alternate accession."""
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="experiment_accession_code", value="ERP108370"
    ).save()

    sra_surveyor = SraSurveyor(survey_job)
    experiment, _ = sra_surveyor.discover_experiment_and_samples()

    self.assertEqual(experiment.accession_code, "ERP108370")
    self.assertEqual(experiment.alternate_accession_code, "E-MTAB-6681")
def test_survey_fungi_none(self, mock_send_job):
    """When surveying fungi an organism_name must be supplied."""
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    SurveyJobKeyValue(survey_job=survey_job,
                      key="ensembl_division",
                      value="EnsemblFungi").save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    surveyor.survey(source_type="TRANSCRIPTOME_INDEX")

    # With no organism_name, nothing should have been queued or sent.
    queued_jobs = DownloaderJob.objects.order_by("id").all()
    self.assertEqual(queued_jobs.count(), 0)
    mock_send_job.assert_not_called()
def survey_experiment(experiment_accession: str, source_type: str):
    """Survey an experiment of type `source_type`.

    Source type corresponds to one of the external sources we support.
    It must be one of the following values:
      * SRA
      * GEO
      * ARRAY_EXPRESS
    """
    survey_job = SurveyJob(source_type=source_type)
    survey_job.save()

    SurveyJobKeyValue(survey_job=survey_job,
                      key="experiment_accession_code",
                      value=experiment_accession).save()

    run_job(survey_job)
    return survey_job
def test_survey(self, mock_send_job):
    """Surveying a whole division queues a downloader job per species."""
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="ensembl_division", value="EnsemblPlants"
    ).save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    surveyor.survey(source_type="TRANSCRIPTOME_INDEX")

    downloader_jobs = DownloaderJob.objects.order_by("id").all()
    self.assertGreater(downloader_jobs.count(), 50)
    expected_calls = [call(Downloaders.TRANSCRIPTOME_INDEX, downloader_job)
                      for downloader_job in downloader_jobs]
    mock_send_job.assert_has_calls(expected_calls)
def survey_sra_experiments(start_accession, end_accession):
    """Survey a range of SRA run accessions, inclusive of both ends.

    Returns the created SurveyJob for consistency with survey_experiment
    and survey_transcriptome_index (backward-compatible: previously
    returned None implicitly).
    """
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="start_accession",
                                       value=start_accession)
    key_value_pair.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="end_accession",
                                       value=end_accession)
    key_value_pair.save()
    run_job(survey_job)
    return survey_job
def test_batch_created(self, mock_get):
    """A single run accession yields exactly one correctly populated batch."""
    mock_get.side_effect = mocked_requests_get

    # Use same run accession for the start and end of the range to
    # achieve a length of 1
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    for key in ("start_accession", "end_accession"):
        SurveyJobKeyValue(survey_job=survey_job, key=key, value=RUN_ACCESSION).save()

    surveyor = SraSurveyor(survey_job)
    self.assertTrue(surveyor.discover_batches())

    # With only a single run accession there should only be a
    # single batch.
    self.assertEqual(len(surveyor.batches), 1)

    batch = surveyor.batches[0]
    self.assertEqual(batch.survey_job.id, survey_job.id)
    expected_batch_attributes = {
        "source_type": "SRA",
        "pipeline_required": "SALMON",
        "platform_accession_code": "IlluminaHiSeq2000",
        "experiment_accession_code": "DRX001563",
        "experiment_title": ("Illumina HiSeq 2000 sequencing; "
                             "Exp_Gg_HH16_1_embryo_mRNAseq"),
        "status": "NEW",
        "release_date": "2013-07-19",
        "last_uploaded_date": "2017-08-11",
        "organism_id": 9031,
        "organism_name": "GALLUS GALLUS",
    }
    for attribute, expected_value in expected_batch_attributes.items():
        self.assertEqual(getattr(batch, attribute), expected_value)

    file = batch.files[0]
    expected_file_attributes = {
        "size_in_bytes": -1,
        "download_url": "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116.fastq.gz",  # noqa
        "raw_format": "fastq.gz",
        "processed_format": "tar.gz",
        "name": "DRR002116.fastq.gz",
        "internal_location": "IlluminaHiSeq2000/SALMON",
    }
    for attribute, expected_value in expected_file_attributes.items():
        self.assertEqual(getattr(file, attribute), expected_value)
def create_survey_job(self):
    """Create and return a saved SRA survey job with its accession key/value."""
    job = SurveyJob(source_type="SRA",
                    nomad_job_id="SURVEYOR/dispatch-1528945054-e8eaf540",
                    num_retries=0,
                    success=None)
    job.save()

    SurveyJobKeyValue(survey_job=job,
                      key="experiment_accession_code",
                      value="RJ-1234-XYZ").save()

    return job
def test_discover_batches(self, mock_generate_batch):
    """Every run accession in the inclusive range gets a batch generated."""
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    for key, value in (("start_accession", "DRR012345"),
                       ("end_accession", "DRR012348")):
        SurveyJobKeyValue(survey_job=survey_job, key=key, value=value).save()

    sra_surveyor = SraSurveyor(survey_job)
    sra_surveyor.discover_batches()

    expected_accessions = ["DRR012345", "DRR012346", "DRR012347", "DRR012348"]
    mock_generate_batch.assert_has_calls(
        [call(accession) for accession in expected_accessions])
def set_source_type_for_accession(survey_job, accession: str) -> None:
    """Set and save the survey job's source type based on the accession shape.

    * "GSE..."            -> GEO
    * "E-..."             -> ARRAY_EXPRESS
    * contains a space    -> TRANSCRIPTOME_INDEX; the accession is treated as
                             "<organism name>[,<ensembl division>]" and the
                             parts are stored as SurveyJobKeyValues.
    * anything else       -> SRA

    The job is saved in every branch. The prefix checks use the idiomatic
    `str.startswith` instead of the previous `'GSE' in accession[:3]`
    substring tests (equivalent behavior, clearer intent).
    """
    if accession.startswith('GSE'):
        survey_job.source_type = "GEO"
        survey_job.save()
        return
    elif accession.startswith('E-'):
        survey_job.source_type = "ARRAY_EXPRESS"
        survey_job.save()
        return
    elif " " in accession:
        survey_job.source_type = "TRANSCRIPTOME_INDEX"
        survey_job.save()

        args = accession.split(",")
        # Allow organism to be unspecified so we survey the entire division.
        organism_name = args[0] if len(args[0]) > 0 else None
        if len(args) > 1:
            ensembl_division = args[1].strip()
        else:
            ensembl_division = "Ensembl"

        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="ensembl_division",
                                           value=ensembl_division)
        key_value_pair.save()
        if organism_name:
            key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                               key="organism_name",
                                               value=organism_name)
            key_value_pair.save()
        return
    else:
        survey_job.source_type = "SRA"
        survey_job.save()
        return
def test_correct_index_location_protist(self):
    """Tests that the files returned actually exist.

    Tests the Protists division instead of the main division. (The
    docstring previously mis-stated this as Metazoa.)
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="ensembl_division",
                                       value="EnsemblProtists")
    key_value_pair.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="organism_name",
                                       value="Leishmania major")
    key_value_pair.save()

    surveyor = TranscriptomeIndexSurveyor(survey_job)
    files = surveyor.discover_species()[0]
    for file in files:
        urllib.request.urlopen(file.source_url)
def survey_transcriptome_index(organism_name=None, ensembl_division='Ensembl'):
    """Special one-off surveyor to build transcriptome indices.

    The external source this uses is ensembl.org which is divided into
    multiple divisions. This function surveys only one division at a time.
    If an `organism_name` is provided, survey only that organism, otherwise
    survey the entire division.
    """
    survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    survey_job.save()

    SurveyJobKeyValue(survey_job=survey_job,
                      key="ensembl_division",
                      value=ensembl_division).save()
    if organism_name:
        SurveyJobKeyValue(survey_job=survey_job,
                          key="organism_name",
                          value=organism_name).save()

    run_job(survey_job)
    return survey_job
def test_correct_index_location(self):
    """Tests that the files returned actually exist.

    Uses an organism in the main division.
    """
    self.survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
    self.survey_job.save()
    for key, value in (("ensembl_division", "Ensembl"),
                       ("organism_name", "Danio rerio")):
        SurveyJobKeyValue(survey_job=self.survey_job, key=key, value=value).save()

    surveyor = TranscriptomeIndexSurveyor(self.survey_job)
    files = surveyor.discover_species()[0]
    for discovered_file in files:
        urllib.request.urlopen(discovered_file.source_url)
def test_srp_survey(self, mock_send_job):
    """A slightly harder test of the SRA surveyor."""
    # (accession, expected alternate accession, expected sample count)
    cases = [
        ("SRP068364", "GSE76780", 4),
        ("SRP111553", "GSE101204", 16),  # 8 samples with 2 runs each
        ("DRP003977", None, 9),
    ]
    for accession, alternate_accession, sample_count in cases:
        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()
        SurveyJobKeyValue(
            survey_job=survey_job, key="experiment_accession_code", value=accession
        ).save()

        sra_surveyor = SraSurveyor(survey_job)
        experiment, samples = sra_surveyor.discover_experiment_and_samples()

        self.assertEqual(experiment.accession_code, accession)
        self.assertEqual(experiment.alternate_accession_code, alternate_accession)
        self.assertEqual(len(samples), sample_count)
def test_geo_survey_microarray(self, mock_send_task):
    """Test that the unsurveyor works correctly.

    This includes not deleting samples which also belong to other
    experiments. Therefore we survey a superseries and one of its
    sub-experiments, then delete the superseries to make sure the
    sub-experiment wasn't touched.

    We mock out the send_job function so that we don't actually process
    these. The unsurveyor code related to ComputedFile,
    ComputationalResult, and ProcessorJobs won't be tested by this, but
    it's been functionally tested.
    """
    superseries_accession = "GSE59795"
    sub_experiment_accession = "GSE46580"

    # Survey the superseries.
    survey_job = SurveyJob(source_type="GEO")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=superseries_accession)
    key_value_pair.save()
    geo_surveyor = GeoSurveyor(survey_job)
    geo_surveyor.survey()

    # Survey the sub-experiment
    survey_job = SurveyJob(source_type="GEO")
    survey_job.save()
    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=sub_experiment_accession)
    key_value_pair.save()
    geo_surveyor = GeoSurveyor(survey_job)
    geo_surveyor.survey()

    # Establish baselines before purge
    # The sub-experiment should have 4 samples with 4 original files.
    experiment = Experiment.objects.filter(accession_code=sub_experiment_accession)[0]
    experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
    samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
    self.assertEqual(samples.count(), 4)

    og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
    original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
    self.assertEqual(original_files.count(), 4)

    # The superseries should have 20 samples with 20 original files
    # (which includes the sub-experiment's 4).
    experiment = Experiment.objects.filter(accession_code=superseries_accession)[0]
    experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
    samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
    self.assertEqual(samples.count(), 20)

    og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
    original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
    self.assertEqual(original_files.count(), 20)

    # Purge the superseries
    purge_experiment(superseries_accession)

    # Make sure the subexperiment samples weren't affected.
    experiment = Experiment.objects.filter(accession_code=sub_experiment_accession)[0]
    experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
    samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
    self.assertEqual(samples.count(), 4)

    # Make sure sub-experiment original files weren't affected.
    og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
    original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
    self.assertEqual(original_files.count(), 4)

    # And that samples and files that remain are from the subseries.
    self.assertEqual(Sample.objects.count(), 4)
    self.assertEqual(OriginalFile.objects.count(), 4)