def test_sra_metadata_is_harmonized(self):
    """Harmonized sample fields are populated from gathered SRA metadata."""
    metadata = SraSurveyor.gather_all_metadata("SRR3098582")

    harmonized_sample = Sample()
    SraSurveyor._apply_harmonized_metadata_to_sample(harmonized_sample, metadata)

    # Check each harmonized field against its expected value.
    expected_fields = {
        "treatment": "biliatresone",
        "subject": "liver",
        "specimen_part": "liver",
    }
    for field_name, expected_value in expected_fields.items():
        self.assertEqual(getattr(harmonized_sample, field_name), expected_value)
def test_survey_unmated_reads(self, mock_send_job):
    """Survey an experiment with unmated reads.

    Also make sure the file report endpoint's properties (expected md5
    and size) are recorded on the original files.
    """
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="experiment_accession_code", value="SRP048683"
    ).save()

    surveyor = SraSurveyor(survey_job)
    experiment, samples = surveyor.discover_experiment_and_samples()

    self.assertEqual(experiment.accession_code, "SRP048683")
    self.assertEqual(len(samples), 12)

    # Just check one file for one sample's expected file size/md5.
    seen_file_names = set()
    for sample in samples:
        if sample.accession_code != "SRR1603661":
            continue
        for original_file in sample.original_files.all():
            seen_file_names.add(original_file.source_filename)
            if original_file.source_filename == "SRR1603661_1.fastq.gz":
                self.assertEqual(
                    original_file.expected_md5, "502a9a482bfa5aa75865ccc0105ad13c"
                )
                self.assertEqual(original_file.expected_size_in_bytes, 6751980628)

    self.assertEqual(
        {"SRR1603661_1.fastq.gz", "SRR1603661_2.fastq.gz"}, seen_file_names
    )
def test_get_next_accession(self):
    """get_next_accession increments run accessions of varying widths."""
    cases = [
        ("DRR123456", "DRR123457"),
        ("DRR1234567", "DRR1234568"),
        ("DRR12345678", "DRR12345679"),
        ("DRR123456789", "DRR123456790"),
    ]
    for current_accession, next_accession in cases:
        self.assertEqual(
            SraSurveyor.get_next_accession(current_accession), next_accession
        )
def test_survey(self):
    """A simple test of the SRA surveyor."""
    surveyor = SraSurveyor(self.survey_job)
    surveyor.discover_experiment_and_samples()

    discovered_samples = Sample.objects.all()

    # We are expecting this to discover 1 sample.
    self.assertEqual(discovered_samples.count(), 1)

    # Confirm the sample's protocol_info matches the experiment's
    # protocol description.
    experiment = Experiment.objects.all().first()
    self.assertEqual(
        discovered_samples.first().protocol_info[0]["Description"],
        experiment.protocol_description,
    )
def test_batch_created(self, mock_get):
    """A one-accession range produces exactly one correctly-populated batch."""
    mock_get.side_effect = mocked_requests_get

    # Use same run accession for the start and end of the range to
    # achieve a length of 1.
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="start_accession", value=RUN_ACCESSION
    ).save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="end_accession", value=RUN_ACCESSION
    ).save()

    surveyor = SraSurveyor(survey_job)
    self.assertTrue(surveyor.discover_batches())

    # With only a single run accession there should only be a
    # single batch.
    self.assertEqual(len(surveyor.batches), 1)

    batch = surveyor.batches[0]
    self.assertEqual(batch.survey_job.id, survey_job.id)
    # Table of expected simple attributes on the batch.
    expected_batch_attributes = {
        "source_type": "SRA",
        "pipeline_required": "SALMON",
        "platform_accession_code": "IlluminaHiSeq2000",
        "experiment_accession_code": "DRX001563",
        "experiment_title": ("Illumina HiSeq 2000 sequencing; "
                             "Exp_Gg_HH16_1_embryo_mRNAseq"),
        "status": "NEW",
        "release_date": "2013-07-19",
        "last_uploaded_date": "2017-08-11",
        "organism_id": 9031,
        "organism_name": "GALLUS GALLUS",
    }
    for attribute_name, expected_value in expected_batch_attributes.items():
        self.assertEqual(getattr(batch, attribute_name), expected_value)

    batch_file = batch.files[0]
    self.assertEqual(batch_file.size_in_bytes, -1)
    self.assertEqual(
        batch_file.download_url,
        "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/DRR002/DRR002116/DRR002116.fastq.gz"
    )  # noqa
    self.assertEqual(batch_file.raw_format, "fastq.gz")
    self.assertEqual(batch_file.processed_format, "tar.gz")
    self.assertEqual(batch_file.name, "DRR002116.fastq.gz")
    self.assertEqual(batch_file.internal_location, "IlluminaHiSeq2000/SALMON")
def test_arrayexpress_alternate_accession(self):
    """ENA experiments correctly detect their ArrayExpress alternate accession."""
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="experiment_accession_code", value="ERP108370"
    ).save()

    surveyor = SraSurveyor(survey_job)
    experiment, _ = surveyor.discover_experiment_and_samples()

    self.assertEqual(experiment.accession_code, "ERP108370")
    self.assertEqual(experiment.alternate_accession_code, "E-MTAB-6681")
def test_sra_metadata_is_harmonized(self):
    """Gathered SRA metadata is applied to both the sample and the experiment."""
    metadata = SraSurveyor.gather_all_metadata("SRR3098582")

    sample = Sample()
    SraSurveyor._apply_harmonized_metadata_to_sample(sample, metadata)
    # Harmonized per-sample fields.
    for field_name, expected_value in (
        ("treatment", "biliatresone"),
        ("subject", "liver"),
        ("specimen_part", "liver"),
    ):
        self.assertEqual(getattr(sample, field_name), expected_value)

    experiment = Experiment()
    SraSurveyor._apply_metadata_to_experiment(experiment, metadata)
    # Experiment-level fields derived from the same metadata.
    self.assertEqual(
        experiment.title,
        "Transcriptional profiling through RNA-seq of zebrafish larval"
        " liver after exposure to biliatresone, a biliary toxin.",
    )
    self.assertEqual(experiment.source_first_published, datetime.date(2017, 9, 25))
    self.assertEqual(experiment.source_last_modified, datetime.date(2017, 9, 25))
def _get_surveyor_for_source(survey_job: SurveyJob):
    """Factory method for ExternalSourceSurveyors."""
    # Map each supported source type to its surveyor class.
    surveyor_classes = {
        "ARRAY_EXPRESS": ArrayExpressSurveyor,
        "SRA": SraSurveyor,
        "TRANSCRIPTOME_INDEX": TranscriptomeIndexSurveyor,
    }
    surveyor_class = surveyor_classes.get(survey_job.source_type)
    if surveyor_class is None:
        raise SourceNotSupportedError("Source " + survey_job.source_type + " is not supported.")
    return surveyor_class(survey_job)
def test_discover_batches(self, mock_generate_batch):
    """discover_batches generates one batch per accession in the range."""
    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="start_accession", value="DRR012345"
    ).save()
    SurveyJobKeyValue(
        survey_job=survey_job, key="end_accession", value="DRR012348"
    ).save()

    surveyor = SraSurveyor(survey_job)
    surveyor.discover_batches()

    expected_accessions = ["DRR012345", "DRR012346", "DRR012347", "DRR012348"]
    mock_generate_batch.assert_has_calls(
        [call(accession) for accession in expected_accessions]
    )
def test_srp_survey(self, mock_send_job):
    """A slightly harder test of the SRA surveyor."""
    # (experiment accession, alternate accession, expected sample count)
    cases = [
        ("SRP068364", "GSE76780", 4),
        ("SRP111553", "GSE101204", 16),  # 8 samples with 2 runs each
        ("DRP003977", None, 9),
    ]
    for accession, alternate_accession, sample_count in cases:
        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()
        SurveyJobKeyValue(
            survey_job=survey_job,
            key="experiment_accession_code",
            value=accession,
        ).save()

        surveyor = SraSurveyor(survey_job)
        experiment, samples = surveyor.discover_experiment_and_samples()

        self.assertEqual(experiment.accession_code, accession)
        self.assertEqual(experiment.alternate_accession_code, alternate_accession)
        self.assertEqual(len(samples), sample_count)
def test_sra_harmony(self):
    """Tests a specific harmonization from SRA."""
    metadata = SraSurveyor.gather_all_metadata("SRR1533126")
    harmonized_sample = self._harmonizer.harmonize_sample(metadata)

    self.assertEqual(
        "Phosphaturic mesenchymal tumour (PMT) case 2 of NTUH",
        harmonized_sample["title"],
    )
    self.assertIn("sex", harmonized_sample)
    self.assertEqual("female", harmonized_sample["sex"])
    self.assertIn("age", harmonized_sample)
    self.assertEqual(57.0, harmonized_sample["age"])
    self.assertIn("specimen_part", harmonized_sample)
    self.assertIn("disease", harmonized_sample)
def test_sra_lots(self):
    """Smoke tests a few SRA types."""
    # These can be built via https://www.ncbi.nlm.nih.gov/sra
    # Searching for: (human) NOT cluster_dbgap[PROP]
    # And then Sent To -> File -> Accession List
    accessions = (
        "ERR188021", "ERR188022", "ERR205021", "ERR205022", "ERR205023",
        "SRR000001",  # Soft fail, bad platform
        "ERR1737666", "ERR030891", "ERR030892", "SRR1542948", "SRR1553477",
        "SRR1542330", "SRR1538698", "SRR1538760", "SRR1538866", "SRR1539218",
        "SRR1797277", "SRR1533126",
    )
    for accession in accessions:
        # Unsupported data types are expected for some accessions; skip them.
        try:
            metadata = SraSurveyor.gather_all_metadata(accession)
            harmonized = harmonize_all_samples([metadata])
            self.assertIsNotNone(harmonized)
        except UnsupportedDataTypeError:
            continue
def test_queue_downloader_jobs_for_original_files(self, mock_send_task):
    """Make sure that queue_downloader_jobs queues all expected Downloader
    jobs for a given experiment.

    Creates an experiment with two samples and two original files per
    sample, then expects one DownloaderJob per sample's file batch.
    """

    def _make_sample(accession_code):
        # Minimal Illumina/SRA sample suitable for downloader-job queuing.
        sample = Sample()
        sample.accession_code = accession_code
        sample.platform_accession_code = "Illumina Genome Analyzer"
        sample.platform_accession_name = "Illumina Genome Analyzer"
        sample.technology = "RNA-SEQ"
        sample.manufacturer = "ILLUMINA"
        sample.source_database = "SRA"
        sample.save()
        return sample

    def _make_original_file(url, filename, sample):
        # Un-downloaded raw original file linked to the given sample.
        original_file = OriginalFile()
        original_file.source_url = url
        original_file.source_filename = filename
        original_file.is_downloaded = False
        original_file.has_raw = True
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.original_file = original_file
        original_file_sample_association.sample = sample
        original_file_sample_association.save()
        return original_file

    # First, create an experiment with two samples associated with it
    # and create two original files for each of those samples.
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object_1 = _make_sample("Sample1")
    sample_object_2 = _make_sample("Sample2")

    for sample_object in (sample_object_1, sample_object_2):
        association = ExperimentSampleAssociation()
        association.experiment = experiment_object
        association.sample = sample_object
        association.save()

    # BUGFIX: previously the "second_url" file was appended to
    # sample_2_original_files even though it was associated with
    # sample_object_1, so the two queued batches held 1 and 3 files
    # instead of each sample's own 2 files. Each list now matches
    # the sample its files are associated with.
    sample_1_original_files = [
        _make_original_file("first_url", "first_filename", sample_object_1),
        _make_original_file("second_url", "second_filename", sample_object_1),
    ]
    sample_2_original_files = [
        _make_original_file("third_url", "third_filename", sample_object_2),
        _make_original_file("fourth_url", "fourth_filename", sample_object_2),
    ]

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        sample_1_original_files, experiment_object.accession_code
    )
    surveyor.queue_downloader_job_for_original_files(
        sample_2_original_files, experiment_object.accession_code
    )

    # One DownloaderJob per per-sample batch of original files.
    self.assertEqual(DownloaderJob.objects.all().count(), 2)
def test_no_repeat_jobs(self):
    """queue_downloader_job_for_original_files does not queue a new job
    when a DownloaderJob already exists for the same original files.
    """
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object = Sample()
    sample_object.accession_code = "Sample1"
    sample_object.platform_accession_code = "Illumina Genome Analyzer"
    sample_object.platform_accession_name = "Illumina Genome Analyzer"
    sample_object.technology = "RNA-SEQ"
    sample_object.manufacturer = "ILLUMINA"
    sample_object.source_database = "SRA"
    sample_object.save()

    # Two original files, both associated with the single sample.
    original_files = []
    for url, filename in (
        ("first_url", "first_filename"),
        ("second_url", "second_filename"),
    ):
        original_file = OriginalFile()
        original_file.source_url = url
        original_file.source_filename = filename
        original_file.is_downloaded = False
        original_file.has_raw = True
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.original_file = original_file
        original_file_sample_association.sample = sample_object
        original_file_sample_association.save()

        original_files.append(original_file)

    # A pre-existing downloader job already covers both files.
    existing_job = DownloaderJob()
    existing_job.save()
    for original_file in original_files:
        DownloaderJobOriginalFileAssociation(
            downloader_job=existing_job, original_file=original_file
        ).save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)
    surveyor.queue_downloader_job_for_original_files(
        original_files, experiment_object.accession_code
    )

    # We made one DownloaderJob in this test, so
    # queue_downloader_job_for_original_files didn't have anything
    # to do, so there should still be only one:
    self.assertEqual(1, DownloaderJob.objects.all().count())
def handle(self, *args, **options):
    """Refreshes the metadata for all experiments, or experiments from a
    specific database.

    Pages through the experiments at PAGE_SIZE per pass, re-fetching and
    re-applying each experiment's metadata from its source database
    (SRA, GEO, or ArrayExpress), sleeping between pages to avoid
    thrashing the DB. Per-experiment failures are logged and skipped.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        experiments = Experiment.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        experiments = Experiment.objects.filter(
            source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(
            options["source_database"])
            + "\nPossible source databases: {}".format(", ".join(
                possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            logger.debug("Refreshing metadata for an experiment.",
                         experiment=experiment.accession_code)
            try:
                if experiment.source_database == "SRA":
                    # Any sample's run accession is enough to re-gather
                    # the experiment-level metadata.
                    metadata = SraSurveyor.gather_all_metadata(
                        experiment.samples.first().accession_code)
                    SraSurveyor._apply_metadata_to_experiment(
                        experiment, metadata)

                elif experiment.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        experiment.accession_code,
                        destdir="/tmp/management",
                        silent=True,
                    )
                    GeoSurveyor._apply_metadata_to_experiment(
                        experiment, gse)

                elif experiment.source_database == "ARRAY_EXPRESS":
                    request_url = EXPERIMENTS_URL + experiment.accession_code
                    experiment_request = utils.requests_retry_session(
                    ).get(request_url, timeout=60)
                    try:
                        parsed_json = experiment_request.json(
                        )["experiments"]["experiment"][0]
                    except KeyError:
                        # BUGFIX: this call previously also passed
                        # survey_job=self.survey_job.id, but management
                        # commands have no survey_job attribute, so the
                        # error path itself raised AttributeError.
                        logger.error(
                            "Remote experiment has no Experiment data!",
                            experiment_accession_code=experiment.accession_code,
                        )
                        continue
                    ArrayExpressSurveyor._apply_metadata_to_experiment(
                        experiment, parsed_json)

                experiment.save()

            # If there are any errors, just continue. It's likely that it's
            # just a problem with this experiment.
            except Exception:
                logger.exception(
                    "exception caught while updating metadata for {}".format(
                        experiment.accession_code))

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)
def handle(self, *args, **options):
    """Refreshes the metadata for all samples, or samples from a specific
    database.

    Pages through the samples at PAGE_SIZE per pass, re-harmonizing each
    sample's metadata from its source database (SRA, GEO, or
    ArrayExpress), sleeping between pages to avoid thrashing the DB.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        samples = Sample.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        samples = Sample.objects.filter(source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(
            options["source_database"])
            + "\nPossible source databases: {}".format(", ".join(
                possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(samples, PAGE_SIZE)
    page = paginator.page()

    while True:
        # BUGFIX: iterate the current page, not the full queryset.
        # Previously this looped over `samples`, so every pass of the
        # while-loop re-processed ALL samples and the pagination (and
        # its throttling sleep) was defeated.
        for sample in page.object_list:
            logger.debug("Refreshing metadata for a sample.",
                         sample=sample.accession_code)
            if sample.source_database == "SRA":
                metadata = SraSurveyor.gather_all_metadata(
                    sample.accession_code)
                SraSurveyor._apply_harmonized_metadata_to_sample(
                    sample, metadata)
            elif sample.source_database == "GEO":
                gse = GEOparse.get_GEO(
                    sample.experiments.first().accession_code,
                    destdir="/tmp/management",
                    how="brief",
                    silent=True,
                )
                preprocessed_samples = harmony.preprocess_geo(
                    gse.gsms.items())
                harmonized_samples = harmony.harmonize(
                    preprocessed_samples)
                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title])
            elif sample.source_database == "ARRAY_EXPRESS":
                SDRF_URL_TEMPLATE = (
                    "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                )
                sdrf_url = SDRF_URL_TEMPLATE.format(
                    code=sample.experiments.first().accession_code)
                sdrf_samples = harmony.parse_sdrf(sdrf_url)
                harmonized_samples = harmony.harmonize(sdrf_samples)
                ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title])

            sample.save()

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)
def test_metadata_is_gathered_correctly(self):
    """gather_all_metadata collects run, sample, experiment, study and
    submission fields for a known DDBJ run accession.
    """
    metadata = SraSurveyor.gather_all_metadata("DRR002116")

    # Expected key/value pairs, checked in the same order as before.
    expected_metadata = {
        "broker_name": "DDBJ",
        "center_name": "RIKEN_CDB",
        "ena-base-count": "158881910957",
        "ena-spot-count": "1371813555",
        "experiment_accession": "DRX001563",
        "experiment_design_description": (
            "Experiment for mRNAseq of chicken at stage "
            "HH16 (biological replicate 1)"
        ),
        "experiment_title": (
            "Illumina HiSeq 2000 sequencing; Exp_Gg_HH16_1_embryo_mRNAseq"
        ),
        "lab_name": (
            "Group for Morphological Evolution, Center for Developmental "
            "Biology, Kobe Institute, RIKEN"
        ),
        "library_layout": "SINGLE",
        "library_name": "Gg_HH16_1_embryo_mRNAseq",
        "library_selection": "RANDOM",
        "library_source": "TRANSCRIPTOMIC",
        "library_strategy": "RNA-Seq",
        "organism_id": "9031",
        "organism_name": "GALLUS GALLUS",
        "platform_instrument_model": "Illumina HiSeq 2000",
        "read_spec_0_base_coord": "1",
        "read_spec_0_class": "Application Read",
        "read_spec_0_index": "0",
        "read_spec_0_type": "Forward",
        "run_accession": "DRR002116",
        "run_center": "RIKEN_CDB",
        "run_date": "2011-09-01T00:00:00+09:00",
        "run_ena_base_count": "3256836000",
        "run_ena_first_public": "2013-07-19",
        "run_ena_last_update": "2017-08-11",
        "run_ena_spot_count": "32568360",
        "sample_accession": "DRS001521",
        "sample_center_name": "BioSample",
        "sample_ena_base_count": "3256836000",
        "sample_ena_first_public": "2013-07-20",
        "sample_ena_last_update": "2015-08-24",
        "sample_ena_spot_count": "32568360",
        "sample_sample_comment": (
            "mRNAseq of chicken at stage HH16 (biological replicate 1)"
        ),
        "sample_sample_name": "DRS001521",
        "sample_title": "Gg_HH16_1_embryo_mRNAseq",
        "spot_length": "100",
        "study_ena_first_public": "2013-07-19",
        "study_ena_last_update": "2015-06-22",
        "study_accession": "DRP000595",
        "submission_accession": "DRA000567",
        "submission_comment": (
            "Time course gene expression profiles of turtle "
            "(Pelodiscus sinensis) and chicken (Gallus gallus) "
            "embryos were examined. Whole transcriptome of turtle "
            "was also determined by uding stranded sequencing "
            "methods."
        ),
        "submission_title": "Submitted by RIKEN_CDB on 19-JUL-2013",
    }
    for key, expected_value in expected_metadata.items():
        self.assertEqual(metadata[key], expected_value)

    # The NCBI file URL varies by mirror; accept any known form.
    ncbi_url = SraSurveyor._build_ncbi_file_url(metadata["run_accession"])
    self.assertIn(
        ncbi_url,
        [
            "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra",
            "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra",
            "[email protected]:data/sracloud/traces/dra0/DRR/000002/DRR002116",
        ],
    )