def test_geo_survey_microarray(self, mock_send_task):
    """Run the GEO surveyor and make sure we get some files to DL!

    For an Illumina Microarray platform.
    """
    self.prep_test("GSE11915")

    surveyor = GeoSurveyor(self.survey_job)
    surveyor.survey()

    self.assertEqual(Sample.objects.count(), 34)

    first_sample = Sample.objects.first()
    self.assertEqual(first_sample.platform_name,
                     "[HG-U133A] Affymetrix Human Genome U133A Array")
    self.assertEqual(first_sample.platform_accession_code, "hgu133a")
    self.assertEqual(first_sample.technology, "MICROARRAY")

    # Confirm sample protocol_info
    gsm = Sample.objects.get(accession_code="GSM299800")
    self.assertEqual(
        gsm.protocol_info['Extraction protocol'],
        ['Chromatin IP performed as described in Odom et al., Science 303, 1378 (Feb 27, 2004)'])
    self.assertEqual(gsm.protocol_info['Data processing'],
                     ['Z-score normalization'])

    self.assertEqual(DownloaderJob.objects.count(), 45)

    # Make sure there aren't extra OriginalFiles
    self.assertEqual(OriginalFile.objects.count(), 45)
def test_geo_survey_agilent(self, mock_send_task):
    """Run the GEO surveyor and make sure we get some files to DL!

    For an Agilent Microarray platform.
    """
    self.prep_test("GSE35186")

    surveyor = GeoSurveyor(self.survey_job)
    surveyor.survey()

    self.assertEqual(Sample.objects.count(), 124)

    first_sample = Sample.objects.first()
    self.assertEqual(
        first_sample.platform_name,
        "Agilent-014850 Whole Human Genome Microarray 4x44K G4112F (Probe Name version)")
    self.assertEqual(first_sample.platform_accession_code, "GPL6480")

    # We currently do not support Agilent platforms, so we can't
    # match its accession to one we know about.
    self.assertEqual(first_sample.technology, "UNKNOWN")

    # There would be 124 samples + 2 metadata files. However at
    # the moment Agilent is unsupported so we don't want to queue
    # downloader jobs.
    self.assertEqual(DownloaderJob.objects.count(), 0)
def test_geo_survey_rnaseq(self, mock_send_task):
    """Run the GEO surveyor and make sure we get some files to DL!

    For an Illumina RNASeq platform.
    """
    self.prep_test("GSE99264")
    GeoSurveyor(self.survey_job).survey()

    self.assertEqual(Sample.objects.count(), 7)

    sample = Sample.objects.first()
    self.assertEqual(sample.platform_name, "Illumina Genome Analyzer II")
    self.assertEqual(sample.platform_accession_code, "Illumina Genome Analyzer II")
    self.assertEqual(sample.technology, "RNA-SEQ")

    self.assertEqual(DownloaderJob.objects.count(), 1)

    # Make sure there aren't extra OriginalFiles
    self.assertEqual(OriginalFile.objects.count(), 1)
def test_geo_survey_superseries(self, mock_send_task):
    """Run the GEO surveyor and make sure we get some files to DL!

    For a Super Series. But also that we don't queue downloader jobs
    for RNA-Seq samples coming from GEO.
    """
    self.prep_test("GSE103217")
    GeoSurveyor(self.survey_job).survey()

    # 28 total samples
    self.assertEqual(Sample.objects.count(), 28)

    # 10 of which are microarray and therefore need downloader jobs
    microarray_count = Sample.objects.filter(technology='MICROARRAY').count()
    self.assertEqual(microarray_count, 10)
    self.assertEqual(DownloaderJob.objects.count(), 10)

    # And 18 of which are RNA-Seq so they won't have downloader jobs.
    rna_seq_count = Sample.objects.filter(technology='RNA-SEQ').count()
    self.assertEqual(rna_seq_count, 18)

    # Make sure there aren't extra OriginalFiles
    self.assertEqual(OriginalFile.objects.count(), 10)
def test_geo_survey_not_agilent(self, mock_send_task):
    """Test to make sure we're setting MFG correctly"""
    self.prep_test("GSE34198")
    GeoSurveyor(self.survey_job).survey()

    self.assertEqual(Sample.objects.first().manufacturer, "ILLUMINA")
def _get_surveyor_for_source(survey_job: SurveyJob):
    """Factory method for ExternalSourceSurveyors."""
    source_type = survey_job.source_type

    if source_type == "ARRAY_EXPRESS":
        return ArrayExpressSurveyor(survey_job)
    elif source_type == "SRA":
        return SraSurveyor(survey_job)
    elif source_type == "TRANSCRIPTOME_INDEX":
        return TranscriptomeIndexSurveyor(survey_job)
    elif source_type == "GEO":
        return GeoSurveyor(survey_job)

    raise SourceNotSupportedError(
        "Source " + source_type + " is not supported.")
def test_geo_survey_rnaseq(self, mock_send_task):
    """Run the GEO surveyor and make sure we discover the experiment/samples.

    For an Illumina RNASeq platform. However it shouldn't actually
    queue any downloader jobs because its RNA-Seq data coming from
    GEO.
    """
    self.prep_test("GSE99264")
    GeoSurveyor(self.survey_job).survey()

    self.assertEqual(Sample.objects.count(), 7)

    sample = Sample.objects.first()
    self.assertEqual(sample.platform_name, "Illumina Genome Analyzer II")
    self.assertEqual(sample.platform_accession_code, "Illumina Genome Analyzer II")
    self.assertEqual(sample.technology, "RNA-SEQ")

    # RNA-Seq data coming from GEO must not get downloader jobs.
    self.assertEqual(DownloaderJob.objects.count(), 0)
def handle(self, *args, **options):
    """Refreshes the metadata for all experiments, or experiments from
    a specific database.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        experiments = Experiment.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        experiments = Experiment.objects.filter(
            source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(
            options["source_database"])
            + "\nPossible source databases: {}".format(", ".join(
                possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            logger.debug("Refreshing metadata for an experiment.",
                         experiment=experiment.accession_code)
            try:
                if experiment.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        experiment.samples.first().accession_code)
                    SraSurveyor._apply_metadata_to_experiment(
                        experiment, metadata)

                elif experiment.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        experiment.accession_code,
                        destdir="/tmp/management",
                        silent=True,
                    )
                    GeoSurveyor._apply_metadata_to_experiment(
                        experiment, gse)

                elif experiment.source_database == "ARRAY_EXPRESS":
                    request_url = EXPERIMENTS_URL + experiment.accession_code
                    experiment_request = utils.requests_retry_session(
                    ).get(request_url, timeout=60)
                    try:
                        parsed_json = experiment_request.json(
                        )["experiments"]["experiment"][0]
                    except KeyError:
                        # BUG FIX: the original also passed
                        # survey_job=self.survey_job.id here, but a management
                        # command has no survey_job attribute, so logging this
                        # error raised AttributeError and the real diagnostic
                        # was swallowed by the outer except.
                        logger.error(
                            "Remote experiment has no Experiment data!",
                            experiment_accession_code=experiment.accession_code,
                        )
                        continue
                    ArrayExpressSurveyor._apply_metadata_to_experiment(
                        experiment, parsed_json)

                experiment.save()

            # If there are any errors, just continue. It's likely that it's
            # just a problem with this experiment.
            except Exception:
                logger.exception(
                    "exception caught while updating metadata for {}".format(
                        experiment.accession_code))

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)
def test_geo_survey_microarray(self, mock_send_task):
    """Test that the unsurveyor works correctly.

    This includes not deleting samples which also belong to other
    experiments. Therefore we survey a superseries and one of its
    sub-experiments, then delete the superseries to make sure the
    sub-experiment wasn't touched.

    We mock out the send_job function so that we don't actually
    process these. The unsurveyor code related to ComputedFile,
    ComputationalResult, and ProcessorJobs won't be tested by this,
    but it's been functionally tested.
    """
    superseries_accession = "GSE59795"
    sub_experiment_accession = "GSE46580"

    def run_geo_survey(accession):
        # Queue and run a GEO survey job for the given accession.
        survey_job = SurveyJob(source_type="GEO")
        survey_job.save()
        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value=accession)
        key_value_pair.save()
        GeoSurveyor(survey_job).survey()

    def related_objects(accession):
        # Return (samples, original_files) querysets for the experiment.
        experiment = Experiment.objects.filter(accession_code=accession)[0]
        sample_assocs = ExperimentSampleAssociation.objects.filter(
            experiment=experiment)
        samples = Sample.objects.filter(
            id__in=sample_assocs.values('sample_id'))
        file_assocs = OriginalFileSampleAssociation.objects.filter(
            sample_id__in=samples.values('id'))
        original_files = OriginalFile.objects.filter(
            id__in=file_assocs.values('original_file_id'))
        return samples, original_files

    # Survey the superseries, then the sub-experiment.
    run_geo_survey(superseries_accession)
    run_geo_survey(sub_experiment_accession)

    # Establish baselines before purge
    samples, original_files = related_objects(sub_experiment_accession)
    self.assertEqual(samples.count(), 4)
    self.assertEqual(original_files.count(), 4)

    samples, original_files = related_objects(superseries_accession)
    self.assertEqual(samples.count(), 20)
    self.assertEqual(original_files.count(), 20)

    # Purge the superseries
    purge_experiment(superseries_accession)

    # Make sure the subexperiment samples and original files
    # weren't affected.
    samples, original_files = related_objects(sub_experiment_accession)
    self.assertEqual(samples.count(), 4)
    self.assertEqual(original_files.count(), 4)

    # And that samples and files that remain are from the subseries.
    self.assertEqual(Sample.objects.count(), 4)
    self.assertEqual(OriginalFile.objects.count(), 4)
def handle(self, *args, **options):
    """Refreshes the metadata for all samples, or samples from a
    specific database.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        samples = Sample.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        samples = Sample.objects.filter(source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(
            options["source_database"])
            + "\nPossible source databases: {}".format(", ".join(
                possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(samples, PAGE_SIZE)
    page = paginator.page()

    while True:
        # BUG FIX: the original iterated the full `samples` queryset
        # here, which defeated the pagination entirely and re-processed
        # every sample on each pass of the while loop. Iterate only the
        # current page, matching the experiment-refresh command.
        for sample in page.object_list:
            logger.debug("Refreshing metadata for a sample.",
                         sample=sample.accession_code)
            if sample.source_database == "SRA":
                metadata = SraSurveyor.gather_all_metadata(
                    sample.accession_code)
                SraSurveyor._apply_harmonized_metadata_to_sample(
                    sample, metadata)
            elif sample.source_database == "GEO":
                gse = GEOparse.get_GEO(
                    sample.experiments.first().accession_code,
                    destdir="/tmp/management",
                    how="brief",
                    silent=True,
                )
                preprocessed_samples = harmony.preprocess_geo(
                    gse.gsms.items())
                harmonized_samples = harmony.harmonize(
                    preprocessed_samples)
                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title])
            elif sample.source_database == "ARRAY_EXPRESS":
                SDRF_URL_TEMPLATE = (
                    "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                )
                sdrf_url = SDRF_URL_TEMPLATE.format(
                    code=sample.experiments.first().accession_code)
                sdrf_samples = harmony.parse_sdrf(sdrf_url)
                harmonized_samples = harmony.harmonize(sdrf_samples)
                ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title])

            sample.save()

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)