Example #1
    def test_smasher_job(self):
        for accession_code in SMASHER_EXPERIMENTS:
            purge_experiment(accession_code)

        prepare_computed_files()

        # The API sometimes takes a bit to come back up.
        start_time = timezone.now()
        while True:
            try:
                pyrefinebio.create_token(agree_to_terms=True, save_token=False)
                break
            except pyrefinebio.ServerError:
                if timezone.now() - start_time > timedelta(minutes=15):
                    raise AssertionError("Server not up after 15 minutes")
                else:
                    sleep(30)

        dataset_path = "end_to_end_test_dataset"
        pyrefinebio.download_dataset(
            dataset_path,
            "*****@*****.**",
            dataset_dict={
                "GSE1487313": ["GSM1487313"],
                "SRP332914": ["SRR332914"]
            },
            timeout=timedelta(minutes=15),
            prompt=False,
        )
        self.assertTrue(path.exists(dataset_path))
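The readiness poll above is a reusable pattern. Here is a minimal sketch that factors it into a helper, using only the standard library; the name wait_until_ready and its parameters are illustrative, not part of pyrefinebio:

    import time
    from datetime import datetime, timedelta

    def wait_until_ready(probe, exceptions, timeout=timedelta(minutes=15), interval=30):
        """Call `probe` until it stops raising `exceptions`, or fail after `timeout`.

        `probe` is any zero-argument callable that raises while the
        service is still coming up, e.g. the create_token call above.
        """
        start_time = datetime.utcnow()
        while True:
            try:
                return probe()
            except exceptions:
                if datetime.utcnow() - start_time > timeout:
                    raise AssertionError("Server not up after %s" % timeout)
                time.sleep(interval)

    # Usage, mirroring the test above:
    # wait_until_ready(
    #     lambda: pyrefinebio.create_token(agree_to_terms=True, save_token=False),
    #     pyrefinebio.ServerError,
    # )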
Example #2
    def test_all_the_things(self):
        for accession_code in EXPERIMENT_ACCESSION_CODES:
            purge_experiment(accession_code)

        self.process_experiments()

        self.check_transcriptome_index()

        self.create_qn_reference()

        self.create_compendia()
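The helper methods called here are defined elsewhere in the test class and aren't shown in this snippet. As a sketch of the purge step alone, assuming purge_experiment and the Experiment model from Example #3 are importable, each purge can be verified immediately:

    def purge_and_verify(accession_codes):
        """Purge each experiment and assert nothing was left behind.

        Sketch only: assumes the Django ORM models used in Example #3.
        """
        for accession_code in accession_codes:
            purge_experiment(accession_code)
            assert not Experiment.objects.filter(
                accession_code=accession_code).exists()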
Example #3
    def test_no_op(self):
        """Survey, download, then process an experiment we know is NO_OP."""
        # Clear out pre-existing work dirs so there are no conflicts:

        self.env = EnvironmentVarGuard()
        self.env.set('RUNNING_IN_CLOUD', 'False')
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Make sure there are no already existing jobs we might poll for unsuccessfully.
            DownloaderJobOriginalFileAssociation.objects.all().delete()
            DownloaderJob.objects.all().delete()
            ProcessorJobOriginalFileAssociation.objects.all().delete()
            ProcessorJob.objects.all().delete()

            # Prevent a call from being made to NCBI's API to determine
            # the organism name/ID.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            accession_code = "E-GEOD-3303"
            survey_job = surveyor.survey_experiment(accession_code,
                                                    "ARRAY_EXPRESS")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertGreater(downloader_jobs.count(), 0)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            processor_jobs = ProcessorJob.objects.all()
            self.assertGreater(processor_jobs.count(), 0)

            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            start_time = timezone.now()
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                self.assertTrue(processor_job.success)

            # Test that the unsurveyor deletes all objects related to the experiment
            purge_experiment(accession_code)

            self.assertEqual(Experiment.objects.all().count(), 0)
            self.assertEqual(ExperimentAnnotation.objects.all().count(), 0)
            self.assertEqual(ExperimentSampleAssociation.objects.all().count(),
                             0)
            self.assertEqual(Sample.objects.all().count(), 0)
            self.assertEqual(SampleAnnotation.objects.all().count(), 0)
            self.assertEqual(OriginalFile.objects.all().count(), 0)
            self.assertEqual(
                OriginalFileSampleAssociation.objects.all().count(), 0)
            self.assertEqual(SampleResultAssociation.objects.all().count(), 0)
            self.assertEqual(ComputationalResult.objects.all().count(), 0)
            self.assertEqual(
                ComputationalResultAnnotation.objects.all().count(), 0)
            self.assertEqual(
                SampleComputedFileAssociation.objects.all().count(), 0)
            self.assertEqual(ComputedFile.objects.all().count(), 0)
            self.assertEqual(DownloaderJob.objects.all().count(), 0)
            self.assertEqual(
                DownloaderJobOriginalFileAssociation.objects.all().count(), 0)
            self.assertEqual(ProcessorJob.objects.all().count(), 0)
            self.assertEqual(
                ProcessorJobOriginalFileAssociation.objects.all().count(), 0)
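wait_for_job is not defined in this snippet. Here is a minimal sketch matching its call sites above, assuming the job models expose end_time and success fields; the timeout and polling interval are illustrative, not the suite's real values:

    from time import sleep
    from datetime import timedelta

    from django.utils import timezone

    MAX_WAIT_TIME = timedelta(minutes=15)  # illustrative value

    def wait_for_job(job, job_class, start_time):
        """Re-fetch `job` from the database until it finishes.

        Returns the refreshed job so the caller can assert on
        job.success, as the test above does.
        """
        while job.end_time is None:
            if timezone.now() - start_time > MAX_WAIT_TIME:
                raise AssertionError(
                    "%s did not finish within %s" % (job_class.__name__, MAX_WAIT_TIME))
            sleep(10)
            job = job_class.objects.get(pk=job.pk)
        return job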
Example #4
    def handle(self, *args, **options):
        """Re-surveys GEO experiments containing samples with incorrect platform information.
        """
        # Check against the CDF-corrected accessions table to avoid re-correcting the same samples.
        corrected_experiments = CdfCorrectedAccession.objects.all().values(
            "accession_code")

        gse_experiments = Experiment.objects.filter(
            source_database="GEO").exclude(
                accession_code__in=corrected_experiments)

        paginator = Paginator(gse_experiments, PAGE_SIZE)
        page = paginator.page(1)

        while True:
            for experiment in page.object_list:
                try:
                    gse = GEOparse.get_GEO(experiment.accession_code,
                                           destdir=GEO_TEMP_DIR,
                                           how="brief",
                                           silent=True)

                    sample_accessions = list(gse.gsms.keys())
                    samples = Sample.objects.filter(
                        accession_code__in=sample_accessions)

                    wrong_platform = False
                    for sample in samples:
                        gpl = gse.gsms[
                            sample.accession_code].metadata["platform_id"][0]
                        internal_accession = get_internal_microarray_accession(
                            gpl)
                        if internal_accession != sample.platform_accession_code:
                            wrong_platform = True
                            break

                    if wrong_platform:
                        if options["dry_run"]:
                            logger.info(
                                "Would have re-surveyed experiment with accession code %s",
                                experiment.accession_code,
                            )
                        else:
                            logger.info(
                                "Re-surveying experiment with accession code %s",
                                experiment.accession_code,
                            )

                            purge_experiment(experiment.accession_code)

                            queue_surveyor_for_accession(
                                experiment.accession_code)

                    current_time = timezone.now()
                    CdfCorrectedAccession(
                        accession_code=experiment.accession_code,
                        created_at=current_time).save()
                except Exception:
                    logger.exception("Caught an exception with %s!",
                                     experiment.accession_code)
                finally:
                    # GEOparse downloads files here and never cleans them up! Grrrr!
                    download_path = GEO_TEMP_DIR + experiment.accession_code + "_family.soft.gz"
                    # os.remove has no ignore_errors, so tolerate failures manually.
                    try:
                        os.remove(download_path)
                    except Exception:
                        # Don't let anything interrupt this, like, say,
                        # GEOparse downloading a directory instead of
                        # a file...
                        logger.exception("Failed to delete an archive.")

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
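The handler reads options["dry_run"], which implies a --dry-run flag registered in add_arguments. A sketch of the assumed command boilerplate, following standard Django management-command conventions:

    from django.core.management.base import BaseCommand

    class Command(BaseCommand):
        help = "Re-survey GEO experiments whose samples have incorrect platform information."

        def add_arguments(self, parser):
            # store_true makes options["dry_run"] default to False when the flag is omitted.
            parser.add_argument(
                "--dry-run",
                action="store_true",
                help="Log what would be re-surveyed without purging or queueing anything.",
            )

        def handle(self, *args, **options):
            ...  # the body shown in Example #4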
Example #5
    def test_geo_survey_microarray(self, mock_send_task):
        """Test that the unsurveyor works correctly.

        This includes not deleting samples which also belong to other
        experiments. Therefore we survey a superseries and one of its
        sub-experiments, then delete the superseries to make sure the
        sub-experiment wasn't touched.

        We mock out the send_job function so that we don't actually
        process these. The unsurveyor code related to ComputedFile,
        ComputationalResult, and ProcessorJobs won't be tested by
        this, but it's been functionally tested.
        """
        superseries_accession = "GSE59795"
        sub_experiment_accession = "GSE46580"

        # Survey the superseries.
        survey_job = SurveyJob(source_type="GEO")
        survey_job.save()

        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value=superseries_accession)
        key_value_pair.save()

        geo_surveyor = GeoSurveyor(survey_job)
        geo_surveyor.survey()

        # Survey the sub-experiment
        survey_job = SurveyJob(source_type="GEO")
        survey_job.save()

        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value=sub_experiment_accession)
        key_value_pair.save()

        geo_surveyor = GeoSurveyor(survey_job)
        geo_surveyor.survey()

        # Establish baselines before purge
        experiment = Experiment.objects.filter(accession_code=sub_experiment_accession)[0]
        experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
        samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
        self.assertEqual(samples.count(), 4)

        og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
        original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
        self.assertEqual(original_files.count(), 4)

        experiment = Experiment.objects.filter(accession_code=superseries_accession)[0]
        experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
        samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
        self.assertEqual(samples.count(), 20)

        og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
        original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
        self.assertEqual(original_files.count(), 20)

        # Purge the superseries
        purge_experiment(superseries_accession)

        # Make sure the sub-experiment samples weren't affected.
        experiment = Experiment.objects.filter(accession_code=sub_experiment_accession)[0]
        experiment_sample_assocs = ExperimentSampleAssociation.objects.filter(experiment=experiment)
        samples = Sample.objects.filter(id__in=experiment_sample_assocs.values('sample_id'))
        self.assertEqual(samples.count(), 4)

        # Make sure sub-experiment original files weren't affected.
        og_file_sample_assocs = OriginalFileSampleAssociation.objects.filter(sample_id__in=samples.values('id'))
        original_files = OriginalFile.objects.filter(id__in=og_file_sample_assocs.values('original_file_id'))
        self.assertEqual(original_files.count(), 4)

        # And that the samples and files that remain are from the sub-experiment.
        self.assertEqual(Sample.objects.count(), 4)
        self.assertEqual(OriginalFile.objects.count(), 4)
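The invariant this test protects is that purging a superseries removes only the samples belonging exclusively to it. A sketch of that check, built from the association models the test already uses; the real unsurveyor implementation may differ:

    def samples_exclusive_to(experiment):
        """Return samples linked to `experiment` and to no other experiment.

        Sketch only: these are the samples a purge may safely delete.
        """
        sample_ids = ExperimentSampleAssociation.objects.filter(
            experiment=experiment).values_list("sample_id", flat=True)
        exclusive_ids = [
            sample_id for sample_id in sample_ids
            if not ExperimentSampleAssociation.objects.filter(
                sample_id=sample_id).exclude(experiment=experiment).exists()
        ]
        return Sample.objects.filter(id__in=exclusive_ids)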