Esempi in Python per harmonize

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: data_refinery_foreman.surveyor.harmony

Metodo/funzione: harmonize

Esempi su hotexamples.com: 6

harmonize in Python: 6 esempi trovati. Questi sono i migliori esempi reali in Python per data_refinery_foreman.surveyor.harmony.harmonize, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Esempio n. 1

Mostra file

File: sra.py Progetto: arjunkrish/refinebio

 def _apply_harmonized_metadata_to_sample(sample: Sample, metadata: dict):
     """Harmonizes the metadata and applies it to `sample`"""
     sample.title = harmony.extract_title(metadata)
     harmonized_sample = harmony.harmonize([metadata])
     for key, value in harmonized_sample[sample.title].items():
         setattr(sample, key, value)

Esempio n. 2

Mostra file

File: geo.py Progetto: erflynn/refinebio

    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id,
            )
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)
            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Othertimes, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id,
                )

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata["organism_ch1"][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If data processing step, it isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    "data_processing", None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata["title"][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample_object, harmonized_samples[sample_object.title])

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    "supplementary_file", [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these!
                    if "idat.gz" in supplementary_file_url.lower():
                        continue
                    if "chp.gz" in supplementary_file_url.lower():
                        continue
                    if "ndf.gz" in supplementary_file_url.lower():
                        continue
                    if "pos.gz" in supplementary_file_url.lower():
                        continue
                    if "pair.gz" in supplementary_file_url.lower():
                        continue
                    if "gff.gz" in supplementary_file_url.lower():
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    if (".cel" in lower_file_url
                            or ("_non_normalized.txt" in lower_file_url)
                            or ("_non-normalized.txt" in lower_file_url)
                            or ("-non-normalized.txt" in lower_file_url)
                            or ("-non_normalized.txt" in lower_file_url)):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = FileUtils.get_filename(supplementary_file_url)
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=FileUtils.is_archive(filename),
                    )[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
                        # Only Affymetrix Microarrays produce .CEL files
                        sample_object.technology = "MICROARRAY"
                        sample_object.manufacturer = "AFFYMETRIX"
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != "RNA-SEQ":
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level,
                # just gotta make sure to only do it once. There can
                # be more than one technology, this should be changed
                # as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                "supplementary_file", []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split("/")[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True,
            )[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            if (("_non_normalized.txt" in lower_supplement_url)
                    or ("_non-normalized.txt" in lower_supplement_url)
                    or ("-non-normalized.txt" in lower_supplement_url)
                    or ("-non_normalized.txt" in lower_supplement_url)):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if (OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0):
                original_file.delete()

        # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families"
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split("/")[-1],
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0):
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples

Esempio n. 3

Mostra file

File: array_express.py Progetto: Quiltomics/refinebio

    def create_samples_from_api(self, experiment: Experiment,
                                platform_dict: Dict) -> List[Sample]:
        """Generates a Sample item for each sample in an AE experiment.

        There are many possible data situations for a sample:

            - If the sample only has raw data available:
                - If it is on a platform that we support:
                    Download this raw data and process it
                - If it is not on a platform we support:
                    Don't download anything, don't process anything
            - If the sample has both raw and derived data:
                - If the raw data is on a platform we support:
                    Download the raw data and process it, abandon the derived data
                - If the raw data is not on a platform we support
                    Download the derived data and no-op it, abandon the raw data
            - If the sample only has derived data:
                Download the derived data and no-op it.

        See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/samples

        """

        created_samples = []

        samples_endpoint = SAMPLES_URL.format(experiment.accession_code)
        r = utils.requests_retry_session().get(samples_endpoint, timeout=60)
        samples = r.json()["experiment"]["sample"]

        # The SDRF is the complete metadata record on a sample/property basis.
        # We run this through our harmonizer and then attach the properties
        # to our created samples.
        SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
        sdrf_url = SDRF_URL_TEMPLATE.format(code=experiment.accession_code)
        sdrf_samples = harmony.parse_sdrf(sdrf_url)
        harmonized_samples = harmony.harmonize(sdrf_samples)

        # An experiment can have many samples
        for sample_data in samples:

            # For some reason, this sample has no files associated with it.
            if "file" not in sample_data or len(sample_data['file']) == 0:
                continue

            # Each sample is given an experimenatlly-unique title.
            flat_sample = utils.flatten(sample_data)
            title = harmony.extract_title(flat_sample)

            # A sample may actually have many sub files.
            # If there is raw data, take that.
            # If not, take the derived.
            has_raw = False
            for sub_file in sample_data['file']:

                # For ex: E-GEOD-15645
                if isinstance(sub_file['comment'], list):
                    sub_file_mod = sub_file
                    sub_file_mod['comment'] = sub_file['comment'][0]
                else:
                    sub_file_mod = sub_file

                # Some have the 'data' field, but not the actual data
                # Ex: E-GEOD-9656
                if sub_file_mod['type'] == "data" and sub_file_mod[
                        'comment'].get('value', None) != None:
                    has_raw = True
                if 'raw' in sub_file_mod['comment'].get('value', ''):
                    has_raw = True

            skip_sample = False
            for sub_file in sample_data['file']:

                # Don't get the raw data if it's only a 1-color sample.
                if 'Cy3' in str(sample_data) and 'Cy5' not in str(sample_data):
                    has_raw = False

                # Skip derived data if we have it raw.
                if has_raw and "derived data" in sub_file['type']:
                    continue

                download_url = None
                filename = sub_file["name"]

                # sub_file["comment"] is only a list if there's
                # more than one comment...
                comments = sub_file["comment"]
                if isinstance(comments, list):
                    # Could be: "Derived ArrayExpress Data Matrix FTP
                    # file" or: "ArrayExpress FTP file". If there is
                    # no comment with a name including "FTP file" then
                    # we don't know where to download it so we need to
                    # mark this job as an error. Therefore don't catch
                    # the potential exception where download_url
                    # doesn't get defined.
                    for comment in comments:
                        if "FTP file" in comment["name"]:
                            download_url = comment["value"]
                            break
                else:
                    download_url = comments["value"]

                if not download_url:
                    logger.error(
                        "Sample %s did not specify a download url, skipping.",
                        sample_accession_code,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

                if not filename:
                    logger.error(
                        "Sample %s did not specify a filename, skipping.",
                        sample_accession_code,
                        experiment_accession_code=experiment.accession_code,
                        survey_job=self.survey_job.id,
                        sub_file=sub_file)
                    skip_sample = True
                    continue

            if skip_sample:
                continue

            # The accession code is not a simple matter to determine.
            sample_source_name = sample_data["source"].get("name", "")
            sample_assay_name = sample_data["assay"].get("name", "")
            sample_accession_code = self.determine_sample_accession(
                experiment.accession_code, sample_source_name,
                sample_assay_name, filename)

            # Figure out the Organism for this sample
            organism_name = UNKNOWN
            for characteristic in sample_data["characteristic"]:
                if characteristic["category"].upper() == "ORGANISM":
                    organism_name = characteristic["value"].upper()

            if organism_name == UNKNOWN:
                logger.error(
                    "Sample %s did not specify the organism name.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
                organism = None
                continue
            else:
                organism = Organism.get_object_for_name(organism_name)

            # Create the sample object
            try:
                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)

                # If input experiment includes new protocol information,
                # update sample's protocol_info.
                existing_protocols = sample_object.protocol_info
                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols, experiment.protocol_description,
                    experiment.source_url + '/protocols')
                if is_updated:
                    sample_object.protocol_info = protocol_info
                    sample_obejct.save()

                logger.debug(
                    "Sample %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id)
            except Sample.DoesNotExist:
                sample_object = Sample()

                # The basics
                sample_object.source_database = "ARRAY_EXPRESS"
                sample_object.title = title
                sample_object.accession_code = sample_accession_code
                sample_object.source_archive_url = samples_endpoint
                sample_object.organism = organism
                sample_object.platform_name = platform_dict[
                    "platform_accession_name"]
                sample_object.platform_accession_code = platform_dict[
                    "platform_accession_code"]
                sample_object.manufacturer = platform_dict["manufacturer"]
                sample_object.technology = "MICROARRAY"

                protocol_info, is_updated = self.update_sample_protocol_info(
                    existing_protocols=[],
                    experiment_protocol=experiment.protocol_description,
                    protocol_url=experiment.source_url + '/protocols')
                # Do not check is_updated the first time because we must
                # save a list so we can append to it later.
                sample_object.protocol_info = protocol_info

                sample_object.save()

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)
                sample_object.save()

                sample_annotation = SampleAnnotation()
                sample_annotation.data = sample_data
                sample_annotation.sample = sample_object
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                original_file = OriginalFile()
                original_file.filename = filename
                original_file.source_filename = filename
                original_file.source_url = download_url
                original_file.is_downloaded = False
                original_file.is_archive = True
                original_file.has_raw = has_raw
                original_file.save()

                original_file_sample_association = OriginalFileSampleAssociation(
                )
                original_file_sample_association.original_file = original_file
                original_file_sample_association.sample = sample_object
                original_file_sample_association.save()

                created_samples.append(sample_object)

                logger.debug(
                    "Created " + str(sample_object),
                    experiment_accession_code=experiment.accession_code,
                    survey_job=self.survey_job.id,
                    sample=sample_object.id)

            # Create associations if they don't already exist
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment, organism=organism)

        return created_samples

Esempio n. 4

Mostra file

    def _generate_experiment_and_samples(
            self,
            run_accession: str,
            study_accession: str = None) -> (Experiment, List[Sample]):
        """Generates Experiments and Samples for the provided run_accession."""
        metadata = SraSurveyor.gather_all_metadata(run_accession)

        if metadata == {}:
            if study_accession:
                logger.error("Could not discover any metadata for run.",
                             accession=run_accession,
                             study_accession=study_accession)
            else:
                logger.error("Could not discover any metadata for run.",
                             accession=run_accession)
            return (None, None)  # This will cascade properly

        if DOWNLOAD_SOURCE == "ENA":
            if metadata["library_layout"] == "PAIRED":
                files_urls = [
                    SraSurveyor._build_ena_file_url(run_accession, "_1"),
                    SraSurveyor._build_ena_file_url(run_accession, "_2")
                ]
            else:
                files_urls = [SraSurveyor._build_ena_file_url(run_accession)]
        else:
            files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

        # Figure out the Organism for this sample
        organism_name = metadata.pop("organism_name", None)
        if not organism_name:
            logger.error("Could not discover organism type for run.",
                         accession=run_accession)
            return (None, None)  # This will cascade properly

        organism_name = organism_name.upper()
        organism = Organism.get_object_for_name(organism_name)

        ##
        # Experiment
        ##

        experiment_accession_code = metadata.get('study_accession')
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment already exists, skipping object creation.",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = ENA_URL_TEMPLATE.format(
                experiment_accession_code)
            experiment_object.source_database = "SRA"
            experiment_object.technology = "RNA-SEQ"

            # We don't get this value from the API, unfortunately.
            # experiment_object.platform_accession_code = experiment["platform_accession_code"]

            if not experiment_object.description:
                experiment_object.description = "No description."

            if "study_title" in metadata:
                experiment_object.title = metadata["study_title"]
            if "study_abstract" in metadata:
                experiment_object.description = metadata["study_abstract"]
            if "lab_name" in metadata:
                experiment_object.submitter_institution = metadata["lab_name"]
            if "experiment_design_description" in metadata:
                experiment_object.protocol_description = metadata[
                    "experiment_design_description"]
            if "pubmed_id" in metadata:
                experiment_object.pubmed_id = metadata["pubmed_id"]
                experiment_object.has_publication = True
            if "study_ena_first_public" in metadata:
                experiment_object.source_first_published = parse_datetime(
                    metadata["study_ena_first_public"])
            if "study_ena_last_update" in metadata:
                experiment_object.source_last_modified = parse_datetime(
                    metadata["study_ena_last_update"])

            # Rare, but it happens.
            if not experiment_object.protocol_description:
                experiment_object.protocol_description = metadata.get(
                    "library_construction_protocol",
                    "Protocol was never provided.")
            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            ##
            # Experiment Metadata
            ##
            json_xa = ExperimentAnnotation()
            json_xa.experiment = experiment_object
            json_xa.data = metadata
            json_xa.is_ccdl = False
            json_xa.save()

        ##
        # Samples
        ##

        sample_accession_code = metadata.pop('run_accession')
        # Create the sample object
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            # If current experiment includes new protocol information,
            # merge it into the sample's existing protocol_info.
            protocol_info, is_updated = self.update_sample_protocol_info(
                sample_object.protocol_info,
                experiment_object.protocol_description,
                experiment_object.source_url)
            if is_updated:
                sample_object.protocol_info = protocol_info
                sample_object.save()

            logger.debug(
                "Sample %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_accession_code=experiment_object.accession_code,
                survey_job=self.survey_job.id)
        except Sample.DoesNotExist:
            sample_object = Sample()
            sample_object.source_database = "SRA"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            sample_object.platform_name = metadata.get(
                "platform_instrument_model", "UNKNOWN")
            # The platform_name is human readable and contains spaces,
            # accession codes shouldn't have spaces though:
            sample_object.platform_accession_code = sample_object.platform_name.replace(
                " ", "")
            sample_object.technology = "RNA-SEQ"
            if "ILLUMINA" in sample_object.platform_name.upper() \
            or "NEXTSEQ" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ILLUMINA"
            elif "ION TORRENT" in sample_object.platform_name.upper():
                sample_object.manufacturer = "ION_TORRENT"
            else:
                sample_object.manufacturer = "UNKNOWN"

            # Directly apply the harmonized values
            sample_object.title = harmony.extract_title(metadata)
            harmonized_sample = harmony.harmonize([metadata])
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            protocol_info, is_updated = self.update_sample_protocol_info(
                existing_protocols=[],
                experiment_protocol=experiment_object.protocol_description,
                experiment_url=experiment_object.source_url)
            # Do not check is_updated the first time because we must
            # save a list so we can append to it later.
            sample_object.protocol_info = protocol_info

            sample_object.save()

            for file_url in files_urls:
                original_file = OriginalFile.objects.get_or_create(
                    source_url=file_url,
                    source_filename=file_url.split('/')[-1],
                    has_raw=True)[0]
                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

        # Create associations if they don't already exist
        ExperimentSampleAssociation.objects.get_or_create(
            experiment=experiment_object, sample=sample_object)

        ExperimentOrganismAssociation.objects.get_or_create(
            experiment=experiment_object, organism=organism)

        return experiment_object, [sample_object]

Esempio n. 5

Mostra file

File: geo.py Progetto: modulexcite/refinebio

    def create_experiment_and_samples_from_api(
            self, experiment_accession_code) -> (Experiment, List[Sample]):
        """ The main surveyor - find the Experiment and Samples from NCBI GEO.

        Uses the GEOParse library, for which docs can be found here: https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

        """
        # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
        gse = GEOparse.get_GEO(experiment_accession_code,
                               destdir=self.get_temp_path(),
                               how="brief",
                               silent=True)
        preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
        harmonized_samples = harmony.harmonize(preprocessed_samples)

        # Create the experiment object
        try:
            experiment_object = Experiment.objects.get(
                accession_code=experiment_accession_code)
            logger.debug(
                "Experiment %s already exists, skipping object creation.",
                experiment_accession_code,
                survey_job=self.survey_job.id)
        except Experiment.DoesNotExist:
            experiment_object = Experiment()
            experiment_object.accession_code = experiment_accession_code
            experiment_object.source_url = (
                "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
                experiment_accession_code)
            experiment_object.source_database = "GEO"
            experiment_object.title = gse.metadata.get('title', [''])[0]
            experiment_object.description = gse.metadata.get('summary',
                                                             [''])[0]

            # Source doesn't provide time information, assume midnight.
            submission_date = gse.metadata["submission_date"][
                0] + " 00:00:00 UTC"
            experiment_object.source_first_published = dateutil.parser.parse(
                submission_date)
            last_updated_date = gse.metadata["last_update_date"][
                0] + " 00:00:00 UTC"
            experiment_object.source_last_updated = dateutil.parser.parse(
                last_updated_date)

            unique_institutions = list(set(gse.metadata["contact_institute"]))
            experiment_object.submitter_institution = ", ".join(
                unique_institutions)
            experiment_object.pubmed_id = gse.metadata.get("pubmed_id",
                                                           [""])[0]

            # Scrape publication title and authorship from Pubmed
            if experiment_object.pubmed_id:
                pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                    experiment_object.pubmed_id)
                experiment_object.publication_title = pubmed_metadata[0]
                experiment_object.publication_authors = pubmed_metadata[1]

            experiment_object.save()

            experiment_annotation = ExperimentAnnotation()
            experiment_annotation.data = gse.metadata
            experiment_annotation.experiment = experiment_object
            experiment_annotation.is_ccdl = False
            experiment_annotation.save()

        # Okay, here's the situation!
        # Sometimes, samples have a direct single representation for themselves.
        # Othertimes, there is a single file with references to every sample in it.
        created_samples = []
        for sample_accession_code, sample in gse.gsms.items():

            try:
                sample_object = Sample.objects.get(
                    accession_code=sample_accession_code)
                logger.debug(
                    "Sample %s from experiment %s already exists, skipping object creation.",
                    sample_accession_code,
                    experiment_object.accession_code,
                    survey_job=self.survey_job.id)

                # Associate it with the experiment, but since it
                # already exists it already has original files
                # associated with it and it's already been downloaded,
                # so don't add it to created_samples.
                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object,
                    organism=sample_object.organism)
            except Sample.DoesNotExist:
                organism = Organism.get_object_for_name(
                    sample.metadata['organism_ch1'][0].upper())

                sample_object = Sample()
                sample_object.source_database = "GEO"
                sample_object.accession_code = sample_accession_code
                sample_object.organism = organism

                # If data processing step, it isn't raw.
                sample_object.has_raw = not sample.metadata.get(
                    'data_processing', None)

                ExperimentOrganismAssociation.objects.get_or_create(
                    experiment=experiment_object, organism=organism)
                sample_object.title = sample.metadata['title'][0]

                self.set_platform_properties(sample_object, sample.metadata,
                                             gse)

                # Directly assign the harmonized properties
                harmonized_sample = harmonized_samples[sample_object.title]
                for key, value in harmonized_sample.items():
                    setattr(sample_object, key, value)

                # Sample-level protocol_info
                sample_object.protocol_info = self.get_sample_protocol_info(
                    sample.metadata, sample_accession_code)

                sample_object.save()
                logger.debug("Created Sample: " + str(sample_object))

                sample_annotation = SampleAnnotation()
                sample_annotation.sample = sample_object
                sample_annotation.data = sample.metadata
                sample_annotation.is_ccdl = False
                sample_annotation.save()

                sample_supplements = sample.metadata.get(
                    'supplementary_file', [])
                for supplementary_file_url in sample_supplements:

                    # Why do they give us this?
                    if supplementary_file_url == "NONE":
                        break

                    # We never want these!
                    if "idat.gz" in supplementary_file_url.lower():
                        continue
                    if "chp.gz" in supplementary_file_url.lower():
                        continue
                    if "ndf.gz" in supplementary_file_url.lower():
                        continue
                    if "pos.gz" in supplementary_file_url.lower():
                        continue
                    if "pair.gz" in supplementary_file_url.lower():
                        continue
                    if "gff.gz" in supplementary_file_url.lower():
                        continue

                    # Sometimes, we are lied to about the data processing step.
                    lower_file_url = supplementary_file_url.lower()
                    if '.cel' in lower_file_url \
                    or ('_non_normalized.txt' in lower_file_url) \
                    or ('_non-normalized.txt' in lower_file_url) \
                    or ('-non-normalized.txt' in lower_file_url) \
                    or ('-non_normalized.txt' in lower_file_url):
                        sample_object.has_raw = True
                        sample_object.save()

                    # filename and source_filename are the same for these
                    filename = supplementary_file_url.split('/')[-1]
                    original_file = OriginalFile.objects.get_or_create(
                        source_url=supplementary_file_url,
                        filename=filename,
                        source_filename=filename,
                        has_raw=sample_object.has_raw,
                        is_archive=True)[0]

                    logger.debug("Created OriginalFile: " + str(original_file))

                    original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                        original_file=original_file, sample=sample_object)

                    if original_file.is_affy_data():
                        # Only Affymetrix Microarrays produce .CEL files
                        sample_object.technology = 'MICROARRAY'
                        sample_object.manufacturer = 'AFFYMETRTIX'
                        sample_object.save()

                # It's okay to survey RNA-Seq samples from GEO, but we
                # don't actually want to download/process any RNA-Seq
                # data unless it comes from SRA.
                if sample_object.technology != 'RNA-SEQ':
                    created_samples.append(sample_object)

                # Now that we've determined the technology at the
                # sample level, we can set it at the experiment level,
                # just gotta make sure to only do it once. There can
                # be more than one technology, this should be changed
                # as part of:
                # https://github.com/AlexsLemonade/refinebio/issues/1099
                if not experiment_object.technology:
                    experiment_object.technology = sample_object.technology
                    experiment_object.save()

                ExperimentSampleAssociation.objects.get_or_create(
                    experiment=experiment_object, sample=sample_object)

        # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
        for experiment_supplement_url in gse.metadata.get(
                'supplementary_file', []):

            # filename and source_filename are the same for these
            filename = experiment_supplement_url.split('/')[-1]
            original_file = OriginalFile.objects.get_or_create(
                source_url=experiment_supplement_url,
                filename=filename,
                source_filename=filename,
                has_raw=sample_object.has_raw,
                is_archive=True)[0]

            logger.debug("Created OriginalFile: " + str(original_file))

            lower_supplement_url = experiment_supplement_url.lower()
            if ('_non_normalized.txt' in lower_supplement_url) \
            or ('_non-normalized.txt' in lower_supplement_url) \
            or ('-non-normalized.txt' in lower_supplement_url) \
            or ('-non_normalized.txt' in lower_supplement_url):
                for sample_object in created_samples:
                    sample_object.has_raw = True
                    sample_object.save()

                    OriginalFileSampleAssociation.objects.get_or_create(
                        sample=sample_object, original_file=original_file)

            # Delete this Original file if it isn't being used.
            if OriginalFileSampleAssociation.objects.filter(
                    original_file=original_file).count() == 0:
                original_file.delete()

        # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
        # GEO describes different types of data formatting as "families"
        family_url = self.get_miniml_url(experiment_accession_code)
        miniml_original_file = OriginalFile.objects.get_or_create(
            source_url=family_url,
            source_filename=family_url.split('/')[-1],
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        for sample_object in created_samples:
            # We don't need a .txt if we have a .CEL
            if sample_object.has_raw:
                continue
            OriginalFileSampleAssociation.objects.get_or_create(
                sample=sample_object, original_file=miniml_original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=miniml_original_file).count() == 0:
            miniml_original_file.delete()

        # Trash the temp path
        try:
            shutil.rmtree(self.get_temp_path())
        except Exception:
            # There was a problem during surveying so this didn't get created.
            # It's not a big deal.
            pass

        return experiment_object, created_samples

Esempio n. 6

Mostra file

File: update_sample_metadata.py Progetto: erflynn/refinebio

    def handle(self, *args, **options):
        """Refreshes the metadata for all samples, or samples from a specific database
        """
        possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

        if options.get("source_database", None) is None:
            samples = Sample.objects.all()
        elif options["source_database"] in possible_source_databases:
            source_database = options["source_database"]
            samples = Sample.objects.filter(source_database=source_database)
        else:
            logger.error('Invalid source database "{}"'.format(
                options["source_database"]) +
                         "\nPossible source databases: {}".format(", ".join(
                             possible_source_databases)))
            sys.exit(1)

        paginator = PerformantPaginator(samples, PAGE_SIZE)
        page = paginator.page()

        while True:
            for sample in samples:
                logger.debug("Refreshing metadata for a sample.",
                             sample=sample.accession_code)
                if sample.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        sample.accession_code)
                    SraSurveyor._apply_harmonized_metadata_to_sample(
                        sample, metadata)
                elif sample.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        sample.experiments.first().accession_code,
                        destdir="/tmp/management",
                        how="brief",
                        silent=True,
                    )
                    preprocessed_samples = harmony.preprocess_geo(
                        gse.gsms.items())
                    harmonized_samples = harmony.harmonize(
                        preprocessed_samples)
                    GeoSurveyor._apply_harmonized_metadata_to_sample(
                        sample, harmonized_samples[sample.title])
                elif sample.source_database == "ARRAY_EXPRESS":
                    SDRF_URL_TEMPLATE = (
                        "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                    )
                    sdrf_url = SDRF_URL_TEMPLATE.format(
                        code=sample.experiments.first().accession_code)
                    sdrf_samples = harmony.parse_sdrf(sdrf_url)
                    harmonized_samples = harmony.harmonize(sdrf_samples)
                    ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                        sample, harmonized_samples[sample.title])

                sample.save()

            if not page.has_next():
                break
            else:
                page = paginator.page(page.next_page_number())

            # 2000 samples queued up every five minutes should be fast
            # enough and also not thrash the DB.
            time.sleep(60 * 5)