def test_smasher_job(self):
    for accession_code in SMASHER_EXPERIMENTS:
        purge_experiment(accession_code)

    prepare_computed_files()

    # The API sometimes takes a bit to come back up.
    start_time = timezone.now()
    while True:
        try:
            pyrefinebio.create_token(agree_to_terms=True, save_token=False)
            break
        except pyrefinebio.ServerError:
            if timezone.now() - start_time > timedelta(minutes=15):
                raise AssertionError("Server not up after 15 minutes")
            else:
                sleep(30)

    dataset_path = "end_to_end_test_dataset"
    pyrefinebio.download_dataset(
        dataset_path,
        "*****@*****.**",
        dataset_dict={"GSE1487313": ["GSM1487313"], "SRP332914": ["SRR332914"]},
        timeout=timedelta(minutes=15),
        prompt=False,
    )
    self.assertTrue(path.exists(dataset_path))
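# The poll-until-ready loop above recurs throughout these end-to-end tests.
# Below is a minimal sketch of how it could be factored out; `retry_until`
# is a hypothetical helper, not part of the existing test suite.
def retry_until(action, exception_class, max_wait=timedelta(minutes=15), poll_interval=30):
    """Call `action` until it stops raising `exception_class` or `max_wait` elapses."""
    start_time = timezone.now()
    while True:
        try:
            return action()
        except exception_class:
            if timezone.now() - start_time > max_wait:
                raise AssertionError("Gave up after {}".format(max_wait))
            sleep(poll_interval)

# Usage, equivalent to the token-creation loop above:
# retry_until(
#     lambda: pyrefinebio.create_token(agree_to_terms=True, save_token=False),
#     pyrefinebio.ServerError,
# )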
def test_all_the_things(self):
    for accession_code in EXPERIMENT_ACCESSION_CODES:
        purge_experiment(accession_code)

    self.process_experiments()
    self.check_transcriptome_index()
    self.create_qn_reference()
    self.create_compendia()
def test_no_op(self):
    """Survey, download, then process an experiment we know is NO_OP."""
    # Clear out pre-existing work dirs so there are no conflicts:
    self.env = EnvironmentVarGuard()
    self.env.set('RUNNING_IN_CLOUD', 'False')
    with self.env:
        for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
            shutil.rmtree(work_dir)

        # Make sure there are no pre-existing jobs we might poll for unsuccessfully.
        DownloaderJobOriginalFileAssociation.objects.all().delete()
        DownloaderJob.objects.all().delete()
        ProcessorJobOriginalFileAssociation.objects.all().delete()
        ProcessorJob.objects.all().delete()

        # Create the organism ahead of time to prevent a call being made to
        # NCBI's API to determine the organism name/id.
        organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        organism.save()

        accession_code = "E-GEOD-3303"
        survey_job = surveyor.survey_experiment(accession_code, "ARRAY_EXPRESS")

        self.assertTrue(survey_job.success)

        downloader_jobs = DownloaderJob.objects.all()
        self.assertGreater(downloader_jobs.count(), 0)

        logger.info("Survey Job finished, waiting for Downloader Jobs to complete.")
        start_time = timezone.now()
        for downloader_job in downloader_jobs:
            downloader_job = wait_for_job(downloader_job, DownloaderJob, start_time)
            self.assertTrue(downloader_job.success)

        processor_jobs = ProcessorJob.objects.all()
        self.assertGreater(processor_jobs.count(), 0)

        logger.info("Downloader Jobs finished, waiting for Processor Jobs to complete.")
        start_time = timezone.now()
        for processor_job in processor_jobs:
            processor_job = wait_for_job(processor_job, ProcessorJob, start_time)
            self.assertTrue(processor_job.success)

        # Test that the unsurveyor deletes all objects related to the experiment.
        purge_experiment(accession_code)

        self.assertEqual(Experiment.objects.all().count(), 0)
        self.assertEqual(ExperimentAnnotation.objects.all().count(), 0)
        self.assertEqual(ExperimentSampleAssociation.objects.all().count(), 0)
        self.assertEqual(Sample.objects.all().count(), 0)
        self.assertEqual(SampleAnnotation.objects.all().count(), 0)
        self.assertEqual(OriginalFile.objects.all().count(), 0)
        self.assertEqual(OriginalFileSampleAssociation.objects.all().count(), 0)
        self.assertEqual(SampleResultAssociation.objects.all().count(), 0)
        self.assertEqual(ComputationalResult.objects.all().count(), 0)
        self.assertEqual(ComputationalResultAnnotation.objects.all().count(), 0)
        self.assertEqual(SampleComputedFileAssociation.objects.all().count(), 0)
        self.assertEqual(ComputedFile.objects.all().count(), 0)
        self.assertEqual(DownloaderJob.objects.all().count(), 0)
        self.assertEqual(DownloaderJobOriginalFileAssociation.objects.all().count(), 0)
        self.assertEqual(ProcessorJob.objects.all().count(), 0)
        self.assertEqual(ProcessorJobOriginalFileAssociation.objects.all().count(), 0)
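# `wait_for_job` lives in the test utilities rather than in this file. Below
# is a minimal sketch of the polling behavior the test above relies on,
# assuming the job models expose nullable `success` and `end_time` fields that
# are set when a job finishes; the timeout and poll interval are illustrative.
def wait_for_job(job, job_class, start_time, timeout=timedelta(minutes=30), poll_interval=20):
    """Poll the database until `job` finishes or `timeout` elapses past `start_time`."""
    job = job_class.objects.get(id=job.id)  # refresh from the database
    while job.end_time is None:
        if timezone.now() - start_time > timeout:
            raise AssertionError("%s %d did not finish in time." % (job_class.__name__, job.id))
        sleep(poll_interval)
        job = job_class.objects.get(id=job.id)
    return job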
def handle(self, *args, **options):
    """Re-surveys GEO experiments containing samples with incorrect platform information."""
    # Check against the CDF-corrected accessions table to prevent
    # re-correcting the same samples.
    corrected_experiments = CdfCorrectedAccession.objects.all().values("accession_code")

    gse_experiments = Experiment.objects.filter(source_database="GEO").exclude(
        accession_code__in=corrected_experiments
    )

    paginator = Paginator(gse_experiments, PAGE_SIZE)
    page = paginator.page(1)

    while True:
        for experiment in page.object_list:
            try:
                gse = GEOparse.get_GEO(
                    experiment.accession_code,
                    destdir=GEO_TEMP_DIR,
                    how="brief",
                    silent=True,
                )

                sample_accessions = list(gse.gsms.keys())
                samples = Sample.objects.filter(accession_code__in=sample_accessions)

                wrong_platform = False
                for sample in samples:
                    gpl = gse.gsms[sample.accession_code].metadata["platform_id"][0]
                    internal_accession = get_internal_microarray_accession(gpl)
                    if internal_accession != sample.platform_accession_code:
                        wrong_platform = True
                        break

                if wrong_platform:
                    if options["dry_run"]:
                        logger.info(
                            "Would have re-surveyed experiment with accession code %s",
                            experiment.accession_code,
                        )
                    else:
                        logger.info(
                            "Re-surveying experiment with accession code %s",
                            experiment.accession_code,
                        )
                        purge_experiment(experiment.accession_code)
                        queue_surveyor_for_accession(experiment.accession_code)

                current_time = timezone.now()
                CdfCorrectedAccession(
                    accession_code=experiment.accession_code, created_at=current_time
                ).save()
            except Exception:
                logger.exception("Caught an exception with %s!", experiment.accession_code)
            finally:
                # GEOparse downloads files here and never cleans them up! Grrrr!
                download_path = GEO_TEMP_DIR + experiment.accession_code + "_family.soft.gz"
                try:
                    os.remove(download_path)
                except Exception:
                    # Don't let anything interrupt this, like, say,
                    # GEOparse downloading a directory instead of a file...
                    logger.exception("Failed to delete an archive.")

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())
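# `get_internal_microarray_accession` maps a GEO platform id (GPL accession)
# to refine.bio's internal platform accession; the command above flags a
# sample as mis-platformed whenever the two disagree. A hedged sketch of the
# lookup, assuming a dict built from the supported-platforms config --
# `GPL_TO_INTERNAL` and its single entry are illustrative, not the real data:
GPL_TO_INTERNAL = {
    "GPL570": "hgu133plus2",  # Affymetrix Human Genome U133 Plus 2.0
    # ... one entry per supported GEO microarray platform
}

def get_internal_microarray_accession(gpl):
    # Unsupported platforms return None, which never equals a sample's
    # platform_accession_code, so the caller above treats them as mismatches.
    return GPL_TO_INTERNAL.get(gpl)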
def test_geo_survey_microarray(self, mock_send_task):
    """Test that the unsurveyor works correctly.

    This includes not deleting samples which also belong to other
    experiments. Therefore we survey a superseries and one of its
    sub-experiments, then delete the superseries to make sure the
    sub-experiment wasn't touched.

    We mock out the send_job function so that we don't actually
    process these. The unsurveyor code related to ComputedFile,
    ComputationalResult, and ProcessorJobs won't be tested by this,
    but it's been functionally tested.
    """
    superseries_accession = "GSE59795"
    sub_experiment_accession = "GSE46580"

    # Survey the superseries.
    survey_job = SurveyJob(source_type="GEO")
    survey_job.save()

    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=superseries_accession)
    key_value_pair.save()

    geo_surveyor = GeoSurveyor(survey_job)
    geo_surveyor.survey()

    # Survey the sub-experiment.
    survey_job = SurveyJob(source_type="GEO")
    survey_job.save()

    key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                       key="experiment_accession_code",
                                       value=sub_experiment_accession)
    key_value_pair.save()

    geo_surveyor = GeoSurveyor(survey_job)
    geo_surveyor.survey()

    # Establish baselines before purge.
    experiment = Experiment.objects.filter(accession_code=sub_experiment_accession)[0]
    experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
    samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
    self.assertEqual(samples.count(), 4)

    og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
    original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
    self.assertEqual(original_files.count(), 4)

    experiment = Experiment.objects.filter(accession_code=superseries_accession)[0]
    experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
    samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
    self.assertEqual(samples.count(), 20)

    og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
    original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
    self.assertEqual(original_files.count(), 20)

    # Purge the superseries.
    purge_experiment(superseries_accession)

    # Make sure the sub-experiment samples weren't affected.
    experiment = Experiment.objects.filter(accession_code=sub_experiment_accession)[0]
    experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
    samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
    self.assertEqual(samples.count(), 4)

    # Make sure sub-experiment original files weren't affected.
    og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
    original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
    self.assertEqual(original_files.count(), 4)

    # And that the samples and files that remain are from the sub-series.
    self.assertEqual(Sample.objects.count(), 4)
    self.assertEqual(OriginalFile.objects.count(), 4)
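# The superseries/sub-series guarantee tested above hinges on the unsurveyor
# checking sample ownership before deleting anything. A minimal sketch of
# that check, using the models already in scope here; `sample_is_shared` is a
# hypothetical name, not the unsurveyor's actual implementation:
def sample_is_shared(sample):
    """True if `sample` belongs to more than one experiment and so must survive a purge."""
    return ExperimentSampleAssociation.objects.filter(sample=sample).count() > 1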