Ejemplo n.º 1
0
    def test_geo_survey_microarray(self, mock_send_task):
        """Survey GSE11915, an Illumina microarray experiment, via GEO.

        Checks the sample count, platform attribution, per-sample
        protocol_info, and that downloader jobs and original files were
        created without duplicates.
        """
        self.prep_test("GSE11915")

        surveyor = GeoSurveyor(self.survey_job)
        surveyor.survey()

        self.assertEqual(Sample.objects.all().count(), 34)

        first_sample = Sample.objects.first()
        self.assertEqual(
            first_sample.platform_name,
            "[HG-U133A] Affymetrix Human Genome U133A Array")
        self.assertEqual(first_sample.platform_accession_code, "hgu133a")
        self.assertEqual(first_sample.technology, "MICROARRAY")

        # Spot-check protocol_info on one known sample.
        ip_sample = Sample.objects.get(accession_code="GSM299800")
        info = ip_sample.protocol_info
        self.assertEqual(info['Extraction protocol'], [
            'Chromatin IP performed as described in Odom et al., Science 303, 1378 (Feb 27, 2004)'
        ])
        self.assertEqual(info['Data processing'], ['Z-score normalization'])

        # One downloader job per file discovered...
        self.assertEqual(DownloaderJob.objects.all().count(), 45)

        # ...and no extra OriginalFiles beyond those.
        self.assertEqual(OriginalFile.objects.all().count(), 45)
Ejemplo n.º 2
0
    def test_geo_survey_agilent(self, mock_send_task):
        """Survey GSE35186, an Agilent microarray experiment, via GEO.

        Agilent platforms are unsupported, so samples should be recorded
        with UNKNOWN technology and no downloader jobs queued.
        """
        self.prep_test("GSE35186")

        surveyor = GeoSurveyor(self.survey_job)
        surveyor.survey()

        self.assertEqual(Sample.objects.all().count(), 124)

        sample = Sample.objects.first()
        self.assertEqual(
            sample.platform_name,
            "Agilent-014850 Whole Human Genome Microarray 4x44K G4112F (Probe Name version)"
        )
        self.assertEqual(sample.platform_accession_code, "GPL6480")
        # We currently do not support Agilent platforms, so we can't
        # match its accession to one we know about.
        self.assertEqual(sample.technology, "UNKNOWN")

        # There would be 124 samples + 2 metadata files. However at
        # the moment Agilent is unsupported so we don't want to queue
        # downloader jobs.
        self.assertEqual(DownloaderJob.objects.all().count(), 0)
Ejemplo n.º 3
0
    def test_geo_survey_rnaseq(self, mock_send_task):
        """Survey GSE99264, an Illumina RNASeq experiment, via GEO.

        Verifies sample discovery, platform attribution, and that exactly
        one downloader job / original file pair was created.
        """
        self.prep_test("GSE99264")

        surveyor = GeoSurveyor(self.survey_job)
        surveyor.survey()

        self.assertEqual(Sample.objects.all().count(), 7)

        sample = Sample.objects.first()
        self.assertEqual(sample.platform_name, "Illumina Genome Analyzer II")
        self.assertEqual(
            sample.platform_accession_code, "Illumina Genome Analyzer II")
        self.assertEqual(sample.technology, "RNA-SEQ")

        # Exactly one downloader job...
        self.assertEqual(DownloaderJob.objects.all().count(), 1)

        # ...and no extra OriginalFiles beyond the one it needs.
        self.assertEqual(OriginalFile.objects.all().count(), 1)
Ejemplo n.º 4
0
    def test_geo_survey_superseries(self, mock_send_task):
        """Survey the GSE103217 Super Series via GEO.

        Also verifies that downloader jobs are queued only for the
        microarray samples, not for RNA-Seq samples coming from GEO.
        """
        self.prep_test("GSE103217")

        surveyor = GeoSurveyor(self.survey_job)
        surveyor.survey()

        # 28 total samples
        self.assertEqual(Sample.objects.all().count(), 28)

        # 10 of which are microarray and therefore need downloader jobs.
        self.assertEqual(
            Sample.objects.filter(technology='MICROARRAY').count(), 10)
        self.assertEqual(DownloaderJob.objects.all().count(), 10)

        # And 18 of which are RNA-Seq so they won't have downloader jobs.
        self.assertEqual(
            Sample.objects.filter(technology='RNA-SEQ').count(), 18)

        # Make sure there aren't extra OriginalFiles beyond the
        # microarray ones.
        self.assertEqual(OriginalFile.objects.all().count(), 10)
Ejemplo n.º 5
0
    def test_geo_survey_not_agilent(self, mock_send_task):
        """Surveying GSE34198 should set the manufacturer to ILLUMINA."""
        self.prep_test("GSE34198")

        surveyor = GeoSurveyor(self.survey_job)
        surveyor.survey()

        sample = Sample.objects.first()
        self.assertEqual(sample.manufacturer, "ILLUMINA")
Ejemplo n.º 6
0
def _get_surveyor_for_source(survey_job: SurveyJob):
    """Factory method for ExternalSourceSurveyors.

    Maps the job's source_type to the matching surveyor class and
    instantiates it; raises SourceNotSupportedError for anything else.
    """
    surveyors_by_source = {
        "ARRAY_EXPRESS": ArrayExpressSurveyor,
        "SRA": SraSurveyor,
        "TRANSCRIPTOME_INDEX": TranscriptomeIndexSurveyor,
        "GEO": GeoSurveyor,
    }

    surveyor_class = surveyors_by_source.get(survey_job.source_type)
    if surveyor_class is None:
        raise SourceNotSupportedError(
            "Source " + survey_job.source_type + " is not supported.")
    return surveyor_class(survey_job)
Ejemplo n.º 7
0
    def test_geo_survey_rnaseq(self, mock_send_task):
        """Survey GSE99264 and verify the experiment/samples are discovered.

        It is an Illumina RNASeq experiment, and RNA-Seq data coming
        from GEO must not queue any downloader jobs.
        """
        self.prep_test("GSE99264")

        surveyor = GeoSurveyor(self.survey_job)
        surveyor.survey()

        self.assertEqual(Sample.objects.all().count(), 7)

        sample = Sample.objects.first()
        self.assertEqual(sample.platform_name, "Illumina Genome Analyzer II")
        self.assertEqual(
            sample.platform_accession_code, "Illumina Genome Analyzer II")
        self.assertEqual(sample.technology, "RNA-SEQ")

        # RNA-Seq from GEO: nothing should be queued for download.
        self.assertEqual(DownloaderJob.objects.all().count(), 0)
    def handle(self, *args, **options):
        """Refresh the metadata for all experiments, or for experiments
        from a specific source database.

        Pages through the experiments PAGE_SIZE at a time, re-fetching
        metadata from the matching external source (SRA, GEO, or
        ArrayExpress) and re-applying it to each experiment. Exits with
        status 1 if an unknown --source-database option was given.
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            experiments = Experiment.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            experiments = Experiment.objects.filter(
                source_database=source_database)
        else:
            logger.error('Invalid source database "{}"'.format(
                options["source_database"]) +
                         "\nPossible source databases: {}".format(", ".join(
                             possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(experiments, PAGE_SIZE)
        page = paginator.page()

        while True:
            for experiment in page.object_list:
                logger.debug("Refreshing metadata for an experiment.",
                             experiment=experiment.accession_code)
                try:
                    if experiment.source_database == "SRA":
                        metadata = SraSurveyor.gather_all_metadata(
                            experiment.samples.first().accession_code)
                        SraSurveyor._apply_metadata_to_experiment(
                            experiment, metadata)

                    elif experiment.source_database == "GEO":
                        gse = GEOparse.get_GEO(
                            experiment.accession_code,
                            destdir="/tmp/management",
                            silent=True,
                        )

                        GeoSurveyor._apply_metadata_to_experiment(
                            experiment, gse)

                    elif experiment.source_database == "ARRAY_EXPRESS":
                        request_url = EXPERIMENTS_URL + experiment.accession_code
                        experiment_request = utils.requests_retry_session(
                        ).get(request_url, timeout=60)
                        try:
                            parsed_json = experiment_request.json(
                            )["experiments"]["experiment"][0]
                        except KeyError:
                            # BUGFIX: this used to also log
                            # survey_job=self.survey_job.id, but a
                            # management command has no survey_job
                            # attribute; the AttributeError was swallowed
                            # by the outer except and masked this log.
                            logger.error(
                                "Remote experiment has no Experiment data!",
                                experiment_accession_code=experiment.accession_code,
                            )
                            continue
                        ArrayExpressSurveyor._apply_metadata_to_experiment(
                            experiment, parsed_json)

                    experiment.save()

                # If there are any errors, just continue. It's likely that it's
                # just a problem with this experiment.
                except Exception:
                    logger.exception(
                        "exception caught while updating metadata for {}".
                        format(experiment.accession_code))

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # Sleeping between pages keeps the refresh from thrashing
            # the DB or the remote services.
            time.sleep(60 * 5)
Ejemplo n.º 9
0
    def test_geo_survey_microarray(self, mock_send_task):
        """Test that the unsurveyor works correctly.

        This includes not deleting samples which also belong to other
        experiments. Therefore we survey a superseries and one of its
        sub-experiments, then delete the superseries to make sure the
        sub-experiment wasn't touched.

        We mock out the send_job function so that we don't actually
        process these. The unsurveyor code related to ComputedFile,
        ComputationalResult, and ProcessorJobs won't be tested by
        this, but it's been functionally tested.
        """
        superseries_accession = "GSE59795"
        sub_experiment_accession = "GSE46580"

        def survey(accession):
            # Run a fresh GEO survey job for the given accession.
            job = SurveyJob(source_type="GEO")
            job.save()
            SurveyJobKeyValue(survey_job=job,
                              key="experiment_accession_code",
                              value=accession).save()
            GeoSurveyor(job).survey()

        def assert_counts(accession, expected):
            # Assert the experiment has `expected` samples and that those
            # samples have `expected` associated original files.
            experiment = Experiment.objects.filter(
                accession_code=accession)[0]
            sample_assocs = ExperimentSampleAssociation.objects.filter(
                experiment=experiment)
            samples = Sample.objects.filter(
                id__in=sample_assocs.values('sample_id'))
            self.assertEqual(samples.count(), expected)

            file_assocs = OriginalFileSampleAssociation.objects.filter(
                sample_id__in=samples.values('id'))
            files = OriginalFile.objects.filter(
                id__in=file_assocs.values('original_file_id'))
            self.assertEqual(files.count(), expected)

        # Survey the superseries, then the sub-experiment.
        survey(superseries_accession)
        survey(sub_experiment_accession)

        # Establish baselines before purge.
        assert_counts(sub_experiment_accession, 4)
        assert_counts(superseries_accession, 20)

        # Purge the superseries.
        purge_experiment(superseries_accession)

        # The sub-experiment's samples and original files must be
        # untouched...
        assert_counts(sub_experiment_accession, 4)

        # ...and everything that remains belongs to the subseries.
        self.assertEqual(Sample.objects.count(), 4)
        self.assertEqual(OriginalFile.objects.count(), 4)
Ejemplo n.º 10
0
    def handle(self, *args, **options):
        """Refresh the metadata for all samples, or for samples from a
        specific source database.

        Pages through the samples PAGE_SIZE at a time, re-harmonizing
        metadata from the matching external source (SRA, GEO, or
        ArrayExpress) and re-applying it to each sample. Exits with
        status 1 if an unknown --source-database option was given.
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            samples = Sample.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            samples = Sample.objects.filter(source_database=source_database)
        else:
            logger.error('Invalid source database "{}"'.format(
                options["source_database"]) +
                         "\nPossible source databases: {}".format(", ".join(
                             possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(samples, PAGE_SIZE)
        page = paginator.page()

        while True:
            # BUGFIX: iterate the current page, not the full queryset.
            # Looping over `samples` here reprocessed every sample once
            # per page, defeating the pagination entirely.
            for sample in page.object_list:
                logger.debug("Refreshing metadata for a sample.",
                             sample=sample.accession_code)
                if sample.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        sample.accession_code)
                    SraSurveyor._apply_harmonized_metadata_to_sample(
                        sample, metadata)
                elif sample.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        sample.experiments.first().accession_code,
                        destdir="/tmp/management",
                        how="brief",
                        silent=True,
                    )
                    preprocessed_samples = harmony.preprocess_geo(
                        gse.gsms.items())
                    harmonized_samples = harmony.harmonize(
                        preprocessed_samples)
                    GeoSurveyor._apply_harmonized_metadata_to_sample(
                        sample, harmonized_samples[sample.title])
                elif sample.source_database == "ARRAY_EXPRESS":
                    SDRF_URL_TEMPLATE = (
                        "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                    )
                    sdrf_url = SDRF_URL_TEMPLATE.format(
                        code=sample.experiments.first().accession_code)
                    sdrf_samples = harmony.parse_sdrf(sdrf_url)
                    harmonized_samples = harmony.harmonize(sdrf_samples)
                    ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                        sample, harmonized_samples[sample.title])

                sample.save()

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # 2000 samples queued up every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)