Esempi in Python per Model.addSameIndividual

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: dipper.models.Model

Classe/tipologia: Model

Metodo/funzione: addSameIndividual

Esempi su hotexamples.com: 23

Model.addSameIndividual in Python: 23 esempi trovati. Questi sono i migliori esempi reali in Python per dipper.models.Model.Model.addSameIndividual, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra Nascondi

Model(30)

addDescription(30)

addIndividualToGraph(30)

addClassToGraph(27)

addEquivalentClass(26)

addXref(23)

addType(23)

addSynonym(18)

addLabel(14)

makeLeader(14)

addBlankNodeAnnotation(10)

addTriple(9)

addSameIndividual(9)

addDeprecatedClass(8)

addSubClass(8)

addDefinition(8)

addDeprecatedIndividual(6)

addOWLPropertyClassRestriction(4)

addDepiction(3)

addComment(3)

_addSexSpecificity(2)

addOWLVersionIRI(2)

addOWLVersionInfo(2)

addOntologyDeclaration(2)

addPerson(2)

Esempio n. 1

Mostra file

File: NCBIGene.py Progetto: monarch-initiative/dipper

    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in xrefs.strip().split('|'):
            prefix = ':'.join(dbxref.split(':')[:-1]).strip()
            if prefix in self.localtt:
                prefix = self.localtt[prefix]
            dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1]))

            if dbxref_curie is not None and prefix != '':
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(
                        gene_id, self.globaltt['has gene product'], dbxref_curie)
                    continue
                    # skip some of these for now based on curie prefix
                if prefix in filter_out:
                    continue

                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                if prefix == 'OMIM':
                    if dbxref_curie in self.omim_replaced:
                        repl = self.omim_replaced[dbxref_curie]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = omim
                    if dbxref_curie in self.omim_type and \
                            self.omim_type[dbxref_curie] != self.globaltt['gene']:
                        continue
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)

Esempio n. 2

Mostra file

File: NCBIGene.py Progetto: kshefchek/dipper

    def _add_gene_equivalencies(self, xrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.testMode:
            graph = self.testgraph
        else:
            graph = self.graph

        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport']
        taxon_spec_filters = {
            '10090': ['ENSEMBL']
        }
        if taxon in taxon_spec_filters:
            filter_out += taxon_spec_filters[taxon]

        model = Model(graph)
        # deal with the xrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696
        for ref in xrefs.strip().split('|'):
            xref_curie = self._cleanup_id(ref)
            if xref_curie is not None and xref_curie.strip() != '':
                if re.match(r'HPRD', xref_curie):
                    # proteins are not == genes.
                    model.addTriple(
                        gene_id,
                        self.properties['has_gene_product'], xref_curie)
                    continue
                    # skip some of these for now
                if xref_curie.split(':')[0] in filter_out:
                    continue
                if re.match(r'^OMIM', xref_curie):
                    if DipperUtil.is_omim_disease(xref_curie):
                        continue
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(
                            gene_id, xref_curie)
                        if int(taxon) in clique_map:
                            if clique_map[int(taxon)] == xref_curie.split(':')[0]:
                                model.makeLeader(xref_curie)
                            elif clique_map[int(taxon)] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, xref_curie)
                except AssertionError as e:
                    logger.warn("Error parsing {0}: {1}".format(gene_id, e))
        return

Esempio n. 3

Mostra file

File: WormBase.py Progetto: kshefchek/dipper

    def process_pub_xrefs(self, limit=None):

        raw = '/'.join((self.rawdir, self.files['pub_xrefs']['file']))

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)
        logger.info("Processing publication xrefs")
        line_counter = 0
        with open(raw, 'r') as csvfile:
            filereader = csv.reader(csvfile, delimiter='\t', quotechar='\"')
            for row in filereader:
                line_counter += 1
                (wb_ref, xref) = row
                # WBPaper00000009 pmid8805<BR>
                # WBPaper00000011 doi10.1139/z78-244<BR>
                # WBPaper00000012 cgc12<BR>

                if self.testMode and wb_ref not in self.test_ids['pub']:
                    continue

                ref_id = 'WormBase:'+wb_ref
                xref_id = None
                r = None
                xref = re.sub(r'<BR>', '', xref)
                xref = xref.strip()
                if re.match(r'pmid', xref):
                    xref_id = 'PMID:'+re.sub(r'pmid\s*', '', xref)
                    reference = Reference(
                        g, xref_id, Reference.ref_types['journal_article'])
                elif re.search(r'[\(\)\<\>\[\]\s]', xref):
                    continue
                elif re.match(r'doi', xref):
                    xref_id = 'DOI:'+re.sub(r'doi', '', xref.strip())
                    reference = Reference(g, xref_id)
                elif re.match(r'cgc', xref):
                    # TODO not sure what to do here with cgc xrefs
                    continue
                else:
                    # logger.debug("Other xrefs like %s", xref)
                    continue

                if xref_id is not None:
                    reference.addRefToGraph()
                    model.addSameIndividual(ref_id, xref_id)

                if not self.testMode \
                        and limit is not None and line_counter > limit:
                    break

        return

Esempio n. 4

Mostra file

File: MPD.py Progetto: DoctorBud/dipper

    def _process_straininfo(self, limit):
        # line_counter = 0  # TODO unused
        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph
        model = Model(g)

        logger.info("Processing measurements ...")
        raw = '/'.join((self.rawdir, self.files['straininfo']['file']))

        tax_id = 'NCBITaxon:10090'

        with open(raw, 'r') as f:
            reader = csv.reader(f, delimiter=',', quotechar='\"')
            self.check_header(self.files['straininfo']['file'], f.readline())
            for row in reader:
                (strain_name, vendor, stocknum, panel, mpd_strainid,
                 straintype, n_proj, n_snp_datasets, mpdshortname, url) = row
                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.testMode and \
                        'MPD:' + str(mpd_strainid) not in self.test_ids:
                    continue
                strain_id = 'MPD-strain:' + str(mpd_strainid)
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname.strip() != '':
                    model.addSynonym(strain_id, mpdshortname.strip())
                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:'+stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':
                        # reiken
                        reiken_id = 'RBRC:'+re.sub(r'RBRC', '', stocknum)
                        model.addSameIndividual(strain_id, reiken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True)

                # add the panel information
                if panel != '':
                    desc = panel+' [panel]'
                    model.addDescription(strain_id, desc)

                # TODO make the panels as a resource collection

        return

Esempio n. 5

Mostra file

File: RGD.py Progetto: TomConlin/dipper

    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian phenotype
        """
        model = Model(self.graph)

        record['relation']['id'] = self.resolve("has phenotype")
        # define the triple
        gene = record['subject']['id']
        relation = record['relation']['id']
        phenotype = record['object']['id']

        # instantiate the association
        g2p_assoc = Assoc(self.graph, self.name, sub=gene, obj=phenotype, pred=relation)

        # add the references
        references = record['evidence']['has_supporting_reference']
        # created RGDRef prefix in curie map to route to proper reference URL in RGD
        references = [
            x.replace('RGD', 'RGDRef') if 'PMID' not in x else x for x in references]

        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(
                self.graph, references[0],
                self.globaltt['publication']
            )
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            # This seems to be specific to this source and
            # there could be non-equivalent references in this list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add the date created on
        g2p_assoc.add_date(date=record['date'])
        g2p_assoc.add_evidence(self.resolve(record['evidence']['type']))  # ?set where?
        g2p_assoc.add_association_to_graph()

        return

Esempio n. 6

Mostra file

File: UCSCBands.py Progetto: TomConlin/dipper

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species

        graph = self.graph
        geno = Genotype(graph)
        model = Model(graph)
        LOG.info("Adding equivalent assembly identifiers")
        for sp in self.species:
            tax_id = self.globaltt[sp]
            txid_num = tax_id.split(':')[1]
            for key in self.files[txid_num]['assembly']:
                ucsc_id = key
                try:
                    ucsc_label = ucsc_id.split(':')[1]
                except IndexError:
                    LOG.error('%s Assembly id:  "%s" is problematic', sp, key)
                    continue
                if key in self.localtt:
                    mapped_id = self.localtt[key]
                else:
                    LOG.error(
                        '%s Assembly id:  "%s" is not in local translation table',
                        sp, key)

                mapped_label = mapped_id.split(':')[1]

                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Esempio n. 7

Mostra file

File: MyDrug.py Progetto: TomConlin/dipper

    def _parse_aeolus_data(self, document, or_limit=None):
        model = Model(self.graph)

        rxcui_curie = "RXCUI:{}".format(document['aeolus']['rxcui'])
        uni_curie = "UNII:{}".format(document['aeolus']['unii'])
        model.addLabel(rxcui_curie, document['aeolus']['drug_name'])
        model.addLabel(uni_curie, document['aeolus']['drug_name'])

        model.addSameIndividual(rxcui_curie, uni_curie)
        self.graph.addTriple(
            rxcui_curie, self.globaltt['inchi_key'], document['unii']['inchikey'],
            object_is_literal=True)

        if or_limit is not None:
            outcomes = (outcome for outcome in document['aeolus']['outcomes']
                        if 'ror' in outcome and outcome['ror'] >= or_limit)
        else:
            outcomes = (outcome for outcome in document['aeolus']['outcomes'])

        for outcome in outcomes:
            drug2outcome_assoc = Assoc(self.graph, self.name)

            meddra_curie = "MEDDRA:{}".format(outcome['code'])
            model.addLabel(meddra_curie, outcome['name'])

            drug2outcome_assoc.sub = rxcui_curie
            drug2outcome_assoc.obj = meddra_curie
            drug2outcome_assoc.rel = self.globaltt['causes_or_contributes']
            drug2outcome_assoc.description = \
                "A proportional reporting ratio or odds " \
                "ratio greater than or equal to {} in the " \
                "AEOLUS data was the significance cut-off " \
                "used for creating drug-outcome associations".format(or_limit)
            drug2outcome_assoc.add_association_to_graph()
            drug2outcome_assoc.add_predicate_object(
                self.globaltt['probabalistic_quantifier'], outcome['ror'], 'Literal')

            self._add_outcome_evidence(drug2outcome_assoc.assoc_id, outcome)
            self._add_outcome_provenance(drug2outcome_assoc.assoc_id, outcome)

Esempio n. 8

Mostra file

File: UDP.py Progetto: TomConlin/dipper

    def _add_variant_sameas_relationships(self, patient_var_map, rs_map):
        """
        Adds same as relationships between udp variant bnodes and dbsnp ids
        :param patient_var_map:
        :param rs_map:
        :return:
        """
        model = Model(self.graph)
        for patient in patient_var_map:
            for variant_id, variant in patient_var_map[patient].items():
                variant_bnode = self.make_id("{0}".format(variant_id), "_")
                build = variant['build']
                chromosome = variant['chromosome']
                position = variant['position']
                reference_allele = variant['reference_allele']
                variant_allele = variant['variant_allele']
                if build and chromosome and position\
                        and reference_allele and variant_allele:
                    if re.fullmatch(r'[ATCG]', reference_allele)\
                            and re.fullmatch(r'[ATCG]', variant_allele):
                        # variation is snp
                        rs_id = self._get_rs_id(variant, rs_map, 'snp')
                        if rs_id:
                            dbsnp_curie = 'dbSNP:rs{0}'.format(rs_id)
                            model.addSameIndividual(variant_bnode, dbsnp_curie)

                    elif re.fullmatch(r'\-', reference_allele)\
                            or re.fullmatch(r'\-', variant_allele):
                        rs_id = self._get_rs_id(variant, rs_map, 'indel')
                        if rs_id is not None:
                            dbsnp_curie = 'dbSNP:rs{0}'.format(rs_id)
                            model.addSameIndividual(variant_bnode, dbsnp_curie)
                    else:
                        rs_id = self.\
                            _get_rs_id(variant, rs_map, 'indel')
                        if rs_id is not None:
                            dbsnp_curie = 'dbSNP:rs{0}'.format(rs_id)
                            model.addSameIndividual(variant_bnode, dbsnp_curie)
        return

Esempio n. 9

Mostra file

File: UDP.py Progetto: TomConlin/dipper

    def _parse_patient_variants(self, file):
        """
        :param file: file handler
        :return:
        """
        patient_var_map = self._convert_variant_file_to_dict(file)
        gene_coordinate_map = self._parse_gene_coordinates(
            self.map_files['gene_coord_map'])
        rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

        genotype = Genotype(self.graph)
        model = Model(self.graph)

        self._add_variant_gene_relationship(patient_var_map, gene_coordinate_map)

        for patient in patient_var_map:
            patient_curie = ':{0}'.format(patient)
            # make intrinsic genotype for each patient
            intrinsic_geno_bnode = self.make_id(
                "{0}-intrinsic-genotype".format(patient), "_")
            genotype_label = "{0} genotype".format(patient)
            genotype.addGenotype(
                intrinsic_geno_bnode, genotype_label,
                model.globaltt['intrinsic_genotype'])

            self.graph.addTriple(
                patient_curie, model.globaltt['has_genotype'], intrinsic_geno_bnode)
            for variant_id, variant in patient_var_map[patient].items():
                build = variant['build']
                chromosome = variant['chromosome']
                position = variant['position']
                reference_allele = variant['reference_allele']
                variant_allele = variant['variant_allele']
                genes_of_interest = variant['genes_of_interest']
                rs_id = variant['rs_id']

                variant_label = ''
                variant_bnode = self.make_id("{0}".format(variant_id), "_")

                # maybe should have these look like the elif statements below
                if position and reference_allele and variant_allele:
                    variant_label = self._build_variant_label(
                        build, chromosome, position, reference_allele,
                        variant_allele, genes_of_interest)
                elif not position and reference_allele and variant_allele \
                        and len(genes_of_interest) == 1:

                    variant_label = self._build_variant_label(
                        build, chromosome, position, reference_allele, variant_allele,
                        genes_of_interest)
                elif position and (not reference_allele or not variant_allele) \
                        and len(genes_of_interest) == 1:

                    variant_label = "{0}{1}({2}):g.{3}".format(
                        build, chromosome, genes_of_interest[0], position)
                elif len(genes_of_interest) == 1:
                    variant_label = 'variant of interest in {0} gene of patient' \
                        ' {1}'.format(genes_of_interest[0], patient)
                else:
                    variant_label = 'variant of interest in patient {0}'.format(patient)

                genotype.addSequenceAlteration(variant_bnode, None)
                # check if it we have built the label
                # in _add_variant_gene_relationship()
                labels = self.graph.objects(
                    BNode(re.sub(r'^_:', '', variant_bnode, 1)), RDFS['label'])

                label_list = list(labels)

                if len(label_list) == 0:
                    model.addLabel(variant_bnode, variant_label)

                self.graph.addTriple(
                    variant_bnode, self.globaltt['in taxon'],
                    self.globaltt['H**o sapiens'])
                self.graph.addTriple(
                    intrinsic_geno_bnode, self.globaltt['has_variant_part'],
                    variant_bnode)
                if rs_id:
                    dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                    model.addSameIndividual(variant_bnode, dbsnp_curie)

        self._add_variant_sameas_relationships(patient_var_map, rs_map)
        return

Esempio n. 10

Mostra file

File: NCBIGene.py Progetto: justaddcoffee/dipper

    def _add_gene_equivalencies(self, dbxrefs, gene_id, taxon):
        """
        Add equivalentClass and sameAs relationships

        Uses external resource map located in
        /resources/clique_leader.yaml to determine
        if an NCBITaxon ID space is a clique leader
        """

        clique_map = self.open_and_parse_yaml(self.resources['clique_leader'])

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        filter_out = ['Vega', 'IMGT/GENE-DB', 'Araport', '']

        # deal with the dbxrefs
        # MIM:614444|HGNC:HGNC:16851|Ensembl:ENSG00000136828|HPRD:11479|Vega:OTTHUMG00000020696

        for dbxref in dbxrefs.strip().split('|'):
            prefix = ':'.join(
                dbxref.split(':')[:-1]).strip()  # restore nonterminal ':'

            if prefix in self.localtt:
                prefix = self.localtt[prefix]

            # skip some of these for now based on curie prefix
            if prefix in filter_out:
                continue

            if prefix == 'AnimalQTLdb' and taxon in self.informal_species:
                prefix = self.informal_species[taxon] + 'QTL'

            dbxref_curie = ':'.join((prefix, dbxref.split(':')[-1]))
            if dbxref_curie is not None:
                if prefix == 'HPRD':  # proteins are not == genes.
                    model.addTriple(gene_id, self.globaltt['has gene product'],
                                    dbxref_curie)
                    continue

                if prefix == 'ENSEMBL':
                    model.addXref(gene_id, dbxref_curie)
                if prefix == 'OMIM':
                    omim_num = dbxref_curie[5:]
                    if omim_num in self.omim_replaced:
                        repl = self.omim_replaced[omim_num]
                        for omim in repl:
                            if omim in self.omim_type and \
                                    self.omim_type[omim] == self.globaltt['gene']:
                                dbxref_curie = 'OMIM:' + omim
                                model.addXref(gene_id, dbxref_curie)
                                omim_num = omim  # last wins

                    elif omim_num in self.omim_type and\
                            self.omim_type[omim_num] == self.globaltt['gene']:
                        model.addXref(gene_id, dbxref_curie)
                    else:
                        continue  # no equivilance between ncbigene and omin-nongene
                # designate clique leaders
                # (perhaps premature as this ingest can't know what else exists)
                try:
                    if self.class_or_indiv.get(gene_id) == 'C':
                        model.addEquivalentClass(gene_id, dbxref_curie)
                        if taxon in clique_map:
                            if clique_map[taxon] == prefix:
                                model.makeLeader(dbxref_curie)
                            elif clique_map[taxon] == gene_id.split(':')[0]:
                                model.makeLeader(gene_id)
                    else:
                        model.addSameIndividual(gene_id, dbxref_curie)
                except AssertionError as err:
                    LOG.warning("Error parsing %s: %s", gene_id, err)

Esempio n. 11

Mostra file

    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian phenotype
        """
        # prep record
        # remove description and mapp Experiment Type to apo term
        experiment_type = record['Experiment Type'].split('(')[0]
        experiment_type = experiment_type.split(',')
        record['experiment_type'] = list()
        for exp_type in experiment_type:
            exp_type = exp_type.lstrip().rstrip()
            record['experiment_type'].append({
                'id': self.apo_term_id[exp_type],
                'term': exp_type,
            })
        sgd_phenotype = record['Phenotype']
        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            'has_quality':
            False  # False = phenotype was descriptive and don't bother looking for a quality
        }
        phenotype = record['Phenotype']
        if ':' in phenotype:
            pheno_obj['has_quality'] = True
            ent_qual = sgd_phenotype.split(': ')
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = Model.object_properties['has_phenotype']  # has phenotype

        if record['pheno_obj']['has_quality']:
            pheno_label = '{0}:{1}'.format(
                record['pheno_obj']['entity']['term'],
                record['pheno_obj']['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                record['pheno_obj']['entity']['apo_id'].replace(':', '_'),
                record['pheno_obj']['quality']['apo_id'].replace(':', '_'))
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
        else:
            pheno_label = record['pheno_obj']['entity']['term']
            pheno_id = record['pheno_obj']['entity']['apo_id']
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
            assoc_id = g2p_assoc.make_association_id(
                definedby='yeastgenome.org',
                subject=gene,
                predicate=relation,
                object=pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        # make pheno subclass of UPHENO:0001001
        model.addTriple(subject_id=pheno_id,
                        predicate_id=Model.object_properties['subclass_of'],
                        obj='UPHENO:0001001')

        # label nodes
        # pheno label
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        # add the descripiton: all the unmodeled data in a '|' delimited list
        description = [
            'genomic_background: {}'.format(record['Strain Background']),
            'allele: {}'.format(record['Allele']),
            'chemical: {}'.format(record['Chemical']),
            'condition: {}'.format(record['Condition']),
            'details: {}'.format(record['Details']),
            'feature_name: {}'.format(record['Feature Name']),
            'gene_name: {}'.format(record['Gene Name']),
            'mutant_type: {}'.format(record['Mutant Type']),
            'reporter: {}'.format(record['Reporter']),
        ]
        g2p_assoc.description = " | ".join(description)

        # add the references
        references = record['Reference']
        references = references.replace(' ', '')
        references = references.split('|')

        #  created RGDRef prefix in curie map to route to proper reference URL in RGD
        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(self.graph, references[0],
                                  Reference.ref_types['publication'])
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
        return

Esempio n. 12

Mostra file

File: UCSCBands.py Progetto: putmantime/dipper

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species
        ucsc_assembly_id_map = {
            "9606": {
                "UCSC:hg38": "NCBIGenome:GRCh38",
                "UCSC:hg19": "NCBIGenome:GRCh37",
                "UCSC:hg18": "NCBIGenome:36.1",
                "UCSC:hg17": "NCBIGenome:35",
                "UCSC:hg16": "NCBIGenome:34",
                "UCSC:hg15": "NCBIGenome:33",
            },
            "7955": {
                "UCSC:danRer10": "NCBIGenome:GRCz10",
                "UCSC:danRer7": "NCBIGenome:Zv9",
                "UCSC:danRer6": "NCBIGenome:Zv8",
            },
            "10090": {
                "UCSC:mm10": "NCBIGenome:GRCm38",
                "UCSC:mm9": "NCBIGenome:37"
            },
            "9031": {
                "UCSC:galGal4": "NCBIAssembly:317958",
            },
            "9913": {
                "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
            },
            "9823": {
                "UCSC:susScr3": "NCBIAssembly:304498",
            },
            "9940": {
                "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
            },
            "9796": {
                "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
            }
        }
        g = self.graph
        geno = Genotype(g)
        model = Model(g)
        logger.info("Adding equivalent assembly identifiers")
        for sp in ucsc_assembly_id_map:
            tax_num = sp
            tax_id = 'NCBITaxon:' + tax_num
            mappings = ucsc_assembly_id_map[sp]
            for i in mappings:
                ucsc_id = i
                ucsc_label = re.split(':', i)[1]
                mapped_id = mappings[i]
                mapped_label = re.split(':', mapped_id)[1]
                mapped_label = 'NCBI build ' + str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Esempio n. 13

Mostra file

File: Coriell.py Progetto: TomConlin/dipper

    def _process_data(self, src_key, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """

        raw = '/'.join((self.rawdir, self.files[src_key]['file']))

        LOG.info("Processing Data from %s", raw)

        if self.test_mode:      # set the graph to build
            graph = self.testgraph
        else:
            graph = self.graph

        family = Family(graph)
        model = Model(graph)

        line_counter = 1
        geno = Genotype(graph)
        diputil = DipperUtil()
        col = self.files[src_key]['columns']
        # affords access with
        # x = row[col.index('x')].strip()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar=r'"')
            # we can keep a close watch on changing file formats
            fileheader = next(filereader, None)
            fileheader = [c.lower() for c in fileheader]
            if col != fileheader:  # assert
                LOG.error('Expected  %s to have columns: %s', raw, col)
                LOG.error('But Found %s to have columns: %s', raw, fileheader)
                raise AssertionError('Incomming data headers have changed.')

            for row in filereader:
                line_counter += 1
                if len(row) != len(col):
                    LOG.warning(
                        'Expected %i values but find %i in  row %i',
                        len(col), len(row), line_counter)
                    continue

                # (catalog_id, description, omim_number, sample_type,
                # cell_line_available, dna_in_stock, dna_ref, gender, age,
                # race, ethnicity, affected, karyotype, relprob, mutation,
                # gene, family_id, collection, url, cat_remark, pubmed_ids,
                # family_member, variant_id, dbsnp_id, species) = row

                # example:
                # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,
                #       ,Female,26 YR,Caucasian,,,,
                # parent,,,39,NIGMS Human Genetic Cell Repository,
                # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                # 46;XX; clinically normal mother of a child with Hurler syndrome;
                #       proband not in Repository,,
                # 2,,18343,H**o sapiens

                catalog_id = row[col.index('catalog_id')].strip()

                if self.test_mode and catalog_id not in self.test_lines:
                    # skip rows not in our test lines, when in test mode
                    continue

                # ###########    BUILD REQUIRED VARIABLES    ###########

                # Make the cell line ID
                cell_line_id = 'Coriell:' + catalog_id
                # Map the cell/sample type
                cell_type = self.resolve(row[col.index('sample_type')].strip())
                # on fail cell_type = self.globaltt['cell'] ?

                # Make a cell line label
                collection = row[col.index('collection')].strip()
                line_label = collection.partition(' ')[0] + '-' + catalog_id

                # Map the repository/collection
                repository = self.localtt[collection]

                # patients are uniquely identified by one of:
                # dbsnp id (which is == an individual haplotype)
                # family id + family member (if present) OR
                # probands are usually family member zero
                # cell line id
                # since some patients have >1 cell line derived from them,
                # we must make sure that the genotype is attached to
                # the patient, and can be inferred to the cell line
                # examples of repeated patients are:
                #   famid=1159, member=1; fam=152,member=1

                # Make the patient ID

                # make an anonymous patient
                patient_id = '_:person'
                fam_id = row[col.index('fam')].strip()
                fammember = row[col.index('fammember')].strip()
                if fam_id != '':
                    patient_id = '-'.join((patient_id, fam_id, fammember))
                else:
                    # make an anonymous patient
                    patient_id = '-'.join((patient_id, catalog_id))

                # properties of the individual patients:  sex, family id,
                # member/relproband, description descriptions are
                # really long and ugly SCREAMING text, so need to clean up
                # the control cases are so odd with this labeling scheme;
                # but we'll deal with it as-is for now.
                description = row[col.index('description')].strip()
                short_desc = (description.split(';')[0]).capitalize()

                gender = row[col.index('gender')].strip().lower()
                affected = row[col.index('affected')].strip()
                relprob = row[col.index('relprob')].strip()

                if affected == '':
                    affected = 'unspecified'
                elif affected in self.localtt:
                    affected = self.localtt[affected]
                else:
                    LOG.warning(
                        'Novel Affected status  %s at row: %i of %s',
                        affected, line_counter, raw)
                patient_label = ' '.join((affected, gender, relprob))
                if relprob == 'proband':
                    patient_label = ' '.join((
                        patient_label.strip(), 'with', short_desc))
                else:
                    patient_label = ' '.join((
                        patient_label.strip(), 'of proband with', short_desc))

                # #############    BUILD THE CELL LINE    #############

                # Adding the cell line as a typed individual.
                cell_line_reagent_id = self.globaltt['cell line']

                model.addIndividualToGraph(
                    cell_line_id, line_label, cell_line_reagent_id)

                # add the equivalent id == dna_ref
                dna_ref = row[col.index('dna_ref')].strip()
                if dna_ref != '' and dna_ref != catalog_id:
                    equiv_cell_line = 'Coriell:' + dna_ref
                    # some of the equivalent ids are not defined
                    # in the source data; so add them
                    model.addIndividualToGraph(
                        equiv_cell_line, None, cell_line_reagent_id)
                    model.addSameIndividual(cell_line_id, equiv_cell_line)

                # Cell line derives from patient
                geno.addDerivesFrom(cell_line_id, patient_id)
                geno.addDerivesFrom(cell_line_id, cell_type)

                # Cell line a member of repository
                family.addMember(repository, cell_line_id)

                cat_remark = row[col.index('cat_remark')].strip()

                if cat_remark != '':
                    model.addDescription(cell_line_id, cat_remark)

                # Cell age_at_sampling
                # TODO add the age nodes when modeled properly in #78
                # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   graph,age_id,age,self.globaltt['age'])
                    # gu.addTriple(
                    #   graph,age_id,self.globaltt['has measurement value'],age,
                    #   True)

                # #############    BUILD THE PATIENT    #############

                # Add the patient ID as an individual.
                model.addPerson(patient_id, patient_label)
                # TODO map relationship to proband as a class
                # (what ontology?)

                # Add race of patient
                # FIXME: Adjust for subcategories based on ethnicity field
                # EDIT: There are 743 different entries for ethnicity...
                # Too many to map?
                # Add ethnicity as literal in addition to the mapped race?
                # Adjust the ethnicity txt (if using)
                # to initial capitalization to remove ALLCAPS

                # TODO race should go into the individual's background
                # and abstracted out to the Genotype class punting for now.
                # if race != '':
                #    mapped_race = self.resolve(race)
                #    if mapped_race is not None:
                #        gu.addTriple(
                #           g,patient_id,self.globaltt['race'], mapped_race)
                #        model.addSubClass(
                #           mapped_race,self.globaltt['ethnic_group'])

                # #############    BUILD THE FAMILY    #############

                # Add triples for family_id, if present.
                if fam_id != '':
                    family_comp_id = 'CoriellFamily:' + fam_id

                    family_label = ' '.join(('Family of proband with', short_desc))

                    # Add the family ID as a named individual
                    model.addIndividualToGraph(
                        family_comp_id, family_label, self.globaltt['family'])

                    # Add the patient as a member of the family
                    family.addMemberOf(patient_id, family_comp_id)

                # #############    BUILD THE GENOTYPE   #############

                # the important things to pay attention to here are:
                # karyotype = chr rearrangements  (somatic?)
                # mutation = protein-level mutation as a label,
                # often from omim
                # gene = gene symbol - TODO get id
                # variant_id = omim variant ids (; delimited)
                # dbsnp_id = snp individual ids = full genotype?

                # note GM00633 is a good example of chromosomal variation
                # - do we have enough to capture this?
                # GM00325 has both abnormal karyotype and variation

                # make an assumption that if the taxon is blank,
                # that it is human!
                species = row[col.index('species')].strip()
                if species is None or species == '':
                    species = 'H**o sapiens'
                taxon = self.resolve(species)

                # if there's a dbSNP id,
                # this is actually the individual's genotype
                genotype_id = None
                genotype_label = None

                dbsnp_id = row[col.index('dbsnp_id')].strip()
                if dbsnp_id != '':
                    genotype_id = 'dbSNPIndividual:' + dbsnp_id

                omim_map = {}
                gvc_id = None

                # some of the karyotypes are encoded
                # with terrible hidden codes. remove them here
                # i've seen a <98> character
                karyotype = row[col.index('karyotype')].strip()
                karyotype = diputil.remove_control_characters(karyotype)
                karyotype_id = None
                if karyotype.strip() != '':
                    karyotype_id = '_:'+re.sub(
                        'MONARCH:', '', self.make_id(karyotype))
                    # add karyotype as karyotype_variation_complement
                    model.addIndividualToGraph(
                        karyotype_id, karyotype,
                        self.globaltt['karyotype_variation_complement'])
                    # TODO break down the karyotype into parts
                    # and map into GENO. depends on #77

                    # place the karyotype in a location(s).
                    karyo_chrs = self._get_affected_chromosomes_from_karyotype(
                        karyotype)
                    for chrom in karyo_chrs:
                        chr_id = makeChromID(chrom, taxon, 'CHR')
                        # add an anonymous sequence feature,
                        # each located on chr
                        karyotype_feature_id = '-'.join((karyotype_id, chrom))
                        karyotype_feature_label = \
                            'some karyotype alteration on chr' + str(chrom)
                        feat = Feature(
                            graph, karyotype_feature_id, karyotype_feature_label,
                            self.globaltt['sequence_alteration'])
                        feat.addFeatureStartLocation(None, chr_id)
                        feat.addFeatureToGraph()
                        geno.addParts(
                            karyotype_feature_id, karyotype_id,
                            self.globaltt['has_variant_part'])

                gene = row[col.index('gene')].strip()
                mutation = row[col.index('mutation')].strip()
                if gene != '':
                    varl = gene + '(' + mutation + ')'

                # fix the variant_id so it's always in the same order
                variant_id = row[col.index('variant_id')].strip()
                vids = variant_id.split(';')
                variant_id = ';'.join(sorted(list(set(vids))))

                if karyotype.strip() != '' and not self._is_normal_karyotype(
                        karyotype):

                    gvc_id = karyotype_id
                    if variant_id != '':
                        gvc_id = '_:' + variant_id.replace(';', '-') + '-' \
                            + re.sub(r'\w*:', '', karyotype_id)
                    if mutation.strip() != '':
                        gvc_label = '; '.join((varl, karyotype))
                    else:
                        gvc_label = karyotype
                elif variant_id.strip() != '':
                    gvc_id = '_:' + variant_id.replace(';', '-')
                    gvc_label = varl
                else:
                    # wildtype?
                    pass

                # add the karyotype to the gvc.
                # use reference if normal karyotype
                karyo_rel = self.globaltt['has_variant_part']
                if self._is_normal_karyotype(karyotype):
                    karyo_rel = self.globaltt['has_reference_part']
                if karyotype_id is not None \
                        and not self._is_normal_karyotype(karyotype) \
                        and gvc_id is not None and karyotype_id != gvc_id:
                    geno.addParts(karyotype_id, gvc_id, karyo_rel)

                if variant_id.strip() != '':
                    # split the variants & add them as part of the genotype
                    # we don't necessarily know their zygosity,
                    # just that they are part of the genotype variant ids
                    # are from OMIM, so prefix as such we assume that the
                    # sequence alts will be defined in OMIM not here
                    # TODO sort the variant_id list, if the omim prefix is
                    # the same, then assume it's the locus make a hashmap
                    # of the omim id to variant id list;
                    # then build the genotype hashmap is also useful for
                    # removing the "genes" from the list of "phenotypes"

                    # will hold gene/locus id to variant list
                    omim_map = {}

                    locus_num = None
                    for var in variant_id.split(';'):
                        # handle omim-style and odd var ids
                        # like 610661.p.R401X
                        mch = re.match(r'(\d+)\.+(.*)', var.strip())
                        if mch is not None and len(mch.groups()) == 2:
                            (locus_num, var_num) = mch.groups()

                        if locus_num is not None and locus_num not in omim_map:
                            omim_map[locus_num] = [var_num]
                        else:
                            omim_map[locus_num] += [var_num]

                    for omim in omim_map:
                        # gene_id = 'OMIM:' + omim  # TODO unused
                        vslc_id = '_:' + '-'.join(
                            [omim + '.' + a for a in omim_map.get(omim)])
                        vslc_label = varl
                        # we don't really know the zygosity of
                        # the alleles at all.
                        # so the vslcs are just a pot of them
                        model.addIndividualToGraph(
                            vslc_id, vslc_label,
                            self.globaltt['variant single locus complement'])
                        for var in omim_map.get(omim):
                            # this is actually a sequence alt
                            allele1_id = 'OMIM:' + omim + '.' + var
                            geno.addSequenceAlteration(allele1_id, None)

                            # assume that the sa -> var_loc -> gene
                            # is taken care of in OMIM
                            geno.addPartsToVSLC(
                                vslc_id, allele1_id, None,
                                self.globaltt['indeterminate'],
                                self.globaltt['has_variant_part'])

                        if vslc_id != gvc_id:
                            geno.addVSLCtoParent(vslc_id, gvc_id)

                if affected == 'unaffected':
                    # let's just say that this person is wildtype
                    model.addType(patient_id, self.globaltt['wildtype'])
                elif genotype_id is None:
                    # make an anonymous genotype id (aka blank node)
                    genotype_id = '_:geno' + catalog_id.strip()

                # add the gvc
                if gvc_id is not None:
                    model.addIndividualToGraph(
                        gvc_id, gvc_label,
                        self.globaltt['genomic_variation_complement'])

                    # add the gvc to the genotype
                    if genotype_id is not None:
                        if affected == 'unaffected':
                            rel = self.globaltt['has_reference_part']
                        else:
                            rel = self.globaltt['has_variant_part']
                        geno.addParts(gvc_id, genotype_id, rel)

                    if karyotype_id is not None \
                            and self._is_normal_karyotype(karyotype):
                        if gvc_label is not None and gvc_label != '':
                            genotype_label = '; '.join((gvc_label, karyotype))
                        elif karyotype is not None:
                            genotype_label = karyotype
                        if genotype_id is None:
                            genotype_id = karyotype_id
                        else:
                            geno.addParts(
                                karyotype_id, genotype_id,
                                self.globaltt['has_reference_part'])
                    else:
                        genotype_label = gvc_label
                        # use the catalog id as the background
                    genotype_label += ' ['+catalog_id.strip()+']'

                if genotype_id is not None and gvc_id is not None:
                    # only add the genotype if it has some parts
                    geno.addGenotype(
                        genotype_id, genotype_label,
                        self.globaltt['intrinsic_genotype'])
                    geno.addTaxon(taxon, genotype_id)
                    # add that the patient has the genotype
                    # TODO check if the genotype belongs to
                    # the cell line or to the patient
                    graph.addTriple(
                        patient_id, self.globaltt['has_genotype'], genotype_id)
                else:
                    geno.addTaxon(taxon, patient_id)

                # TODO: Add sex/gender  (as part of the karyotype?)
                # = row[col.index('')].strip()
                # #############    DEAL WITH THE DISEASES   #############
                omim_num = row[col.index('omim_num')].strip()

                # we associate the disease to the patient
                if affected == 'affected' and omim_num != '':
                    for disease in omim_num.split(';'):
                        if disease is not None and disease != '':
                            # if the omim number is in omim_map,
                            # then it is a gene not a pheno

                            # TEC - another place to use the mimTitle omim
                            # classifier omia & genereviews are using

                            if disease not in omim_map:
                                disease_id = 'OMIM:' + disease.strip()
                                # assume the label is taken care of in OMIM
                                model.addClassToGraph(disease_id, None)

                                # add the association:
                                #   the patient has the disease
                                assoc = G2PAssoc(
                                    graph, self.name,
                                    patient_id, disease_id)
                                assoc.add_association_to_graph()

                                # this line is a model of this disease
                                # TODO abstract out model into
                                # it's own association class?
                                graph.addTriple(
                                    cell_line_id,
                                    self.globaltt['is model of'],
                                    disease_id)
                            else:
                                LOG.info('drop gene %s from disease list', disease)

                # #############    ADD PUBLICATIONS   #############
                pubmed_ids = row[col.index('pubmed_ids')].strip()
                if pubmed_ids != '':
                    for pmid in pubmed_ids.split(';'):
                        pubmed_id = 'PMID:' + pmid.strip()
                        ref = Reference(graph, pubmed_id)
                        ref.setType(self.globaltt['journal article'])
                        ref.addRefToGraph()
                        graph.addTriple(
                            pubmed_id, self.globaltt['mentions'], cell_line_id)

                if not self.test_mode and (
                        limit is not None and line_counter > limit):
                    break
        return

Esempio n. 14

Mostra file

    def _process_data(self, raw, limit=None):
        """
        This function will process the data files from Coriell.
        We make the assumption that any alleles listed are variants
        (alternates to w.t.)

        Triples: (examples)

        :NIGMSrepository a CLO_0000008 #repository
        label : NIGMS Human Genetic Cell Repository
        foaf:page
         https://catalog.coriell.org/0/sections/collections/NIGMS/?SsId=8

        line_id a CL_0000057,  #fibroblast line
            derives_from patient_id
            part_of :NIGMSrepository
            RO:model_of OMIM:disease_id

        patient id a foaf:person,
            label: "fibroblast from patient 12345 with disease X"
            member_of family_id  #what is the right thing here?
            SIO:race EFO:caucasian  #subclass of EFO:0001799
            in_taxon NCBITaxon:9606
            dc:description Literal(remark)
            RO:has_phenotype OMIM:disease_id
            GENO:has_genotype genotype_id

        family_id a owl:NamedIndividual
            foaf:page
             "https://catalog.coriell.org/0/Sections/BrowseCatalog/FamilyTypeSubDetail.aspx?PgId=402&fam=2104&coll=GM"

        genotype_id a intrinsic_genotype
            GENO:has_alternate_part allelic_variant_id
            we don't necessarily know much about the genotype,
            other than the allelic variant. also there's the sex here

        pub_id mentions cell_line_id

        :param raw:
        :param limit:
        :return:

        """

        logger.info("Processing Data from %s", raw)

        if self.testMode:  # set the graph to build
            g = self.testgraph
        else:
            g = self.graph

        family = Family(g)
        model = Model(g)

        line_counter = 0
        geno = Genotype(g)
        du = DipperUtil()

        with open(raw, 'r', encoding="iso-8859-1") as csvfile:
            filereader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            next(filereader, None)  # skip the header row
            for row in filereader:
                if not row:
                    pass
                else:
                    line_counter += 1

                    (catalog_id, description, omim_number, sample_type,
                     cell_line_available, dna_in_stock, dna_ref, gender, age,
                     race, ethnicity, affected, karyotype, relprob, mutation,
                     gene, family_id, collection, url, cat_remark, pubmed_ids,
                     family_member, variant_id, dbsnp_id, species) = row

                    # example:
                    # GM00003,HURLER SYNDROME,607014,Fibroblast,Yes,No,,Female,26 YR,Caucasian,,,,
                    # parent,,,39,NIGMS Human Genetic Cell Repository,
                    # http://ccr.coriell.org/Sections/Search/Sample_Detail.aspx?Ref=GM00003,
                    # 46;XX; clinically normal mother of a child with Hurler syndrome; proband not in Repository,,
                    # 2,,18343,H**o sapiens

                    if self.testMode and catalog_id not in self.test_lines:
                        # skip rows not in our test lines, when in test mode
                        continue

                    # ###########    BUILD REQUIRED VARIABLES    ###########

                    # Make the cell line ID
                    cell_line_id = 'Coriell:' + catalog_id.strip()

                    # Map the cell/sample type
                    cell_type = self._map_cell_type(sample_type)

                    # Make a cell line label
                    line_label = \
                        collection.partition(' ')[0]+'-'+catalog_id.strip()

                    # Map the repository/collection
                    repository = self._map_collection(collection)

                    # patients are uniquely identified by one of:
                    # dbsnp id (which is == an individual haplotype)
                    # family id + family member (if present) OR
                    # probands are usually family member zero
                    # cell line id
                    # since some patients have >1 cell line derived from them,
                    # we must make sure that the genotype is attached to
                    # the patient, and can be inferred to the cell line
                    # examples of repeated patients are:
                    #   famid=1159, member=1; fam=152,member=1

                    # Make the patient ID

                    # make an anonymous patient
                    patient_id = '_:person'
                    if family_id != '':
                        patient_id = \
                            '-'.join((patient_id, family_id, family_member))
                    else:
                        # make an anonymous patient
                        patient_id = '-'.join((patient_id, catalog_id.strip()))

                    # properties of the individual patients:  sex, family id,
                    # member/relproband, description descriptions are
                    # really long and ugly SCREAMING text, so need to clean up
                    # the control cases are so odd with this labeling scheme;
                    # but we'll deal with it as-is for now.
                    short_desc = (description.split(';')[0]).capitalize()
                    if affected == 'Yes':
                        affected = 'affected'
                    elif affected == 'No':
                        affected = 'unaffected'
                    gender = gender.lower()
                    patient_label = ' '.join((affected, gender, relprob))
                    if relprob == 'proband':
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'with', short_desc))
                    else:
                        patient_label = \
                            ' '.join(
                                (patient_label.strip(), 'of proband with',
                                 short_desc))

                    # #############    BUILD THE CELL LINE    #############

                    # Adding the cell line as a typed individual.
                    cell_line_reagent_id = 'CLO:0000031'

                    model.addIndividualToGraph(cell_line_id, line_label,
                                               cell_line_reagent_id)

                    # add the equivalent id == dna_ref
                    if dna_ref != '' and dna_ref != catalog_id:
                        equiv_cell_line = 'Coriell:' + dna_ref
                        # some of the equivalent ids are not defined
                        # in the source data; so add them
                        model.addIndividualToGraph(equiv_cell_line, None,
                                                   cell_line_reagent_id)
                        model.addSameIndividual(cell_line_id, equiv_cell_line)

                    # Cell line derives from patient
                    geno.addDerivesFrom(cell_line_id, patient_id)
                    geno.addDerivesFrom(cell_line_id, cell_type)

                    # Cell line a member of repository
                    family.addMember(repository, cell_line_id)

                    if cat_remark != '':
                        model.addDescription(cell_line_id, cat_remark)

                    # Cell age_at_sampling
                    # TODO add the age nodes when modeled properly in #78
                    # if (age != ''):
                    # this would give a BNode that is an instance of Age.
                    # but i don't know how to connect
                    # the age node to the cell line? we need to ask @mbrush
                    # age_id = '_'+re.sub('\s+','_',age)
                    # gu.addIndividualToGraph(
                    #   g,age_id,age,self.terms['age'])
                    # gu.addTriple(
                    #   g,age_id,self.properties['has_measurement'],age,
                    #   True)

                    # #############    BUILD THE PATIENT    #############

                    # Add the patient ID as an individual.
                    model.addPerson(patient_id, patient_label)
                    # TODO map relationship to proband as a class
                    # (what ontology?)

                    # Add race of patient
                    # FIXME: Adjust for subcategories based on ethnicity field
                    # EDIT: There are 743 different entries for ethnicity...
                    # Too many to map?
                    # Add ethnicity as literal in addition to the mapped race?
                    # Adjust the ethnicity txt (if using)
                    # to initial capitalization to remove ALLCAPS

                    # TODO race should go into the individual's background
                    # and abstracted out to the Genotype class punting for now.
                    # if race != '':
                    #    mapped_race = self._map_race(race)
                    #    if mapped_race is not None:
                    #        gu.addTriple(
                    #           g,patient_id,self.terms['race'],mapped_race)
                    #        model.addSubClass(
                    #           mapped_race,self.terms['ethnic_group'])

                    # #############    BUILD THE FAMILY    #############

                    # Add triples for family_id, if present.
                    if family_id != '':
                        family_comp_id = 'CoriellFamily:' + family_id

                        family_label = \
                            ' '.join(('Family of proband with', short_desc))

                        # Add the family ID as a named individual
                        model.addIndividualToGraph(family_comp_id,
                                                   family_label,
                                                   geno.genoparts['family'])

                        # Add the patient as a member of the family
                        family.addMemberOf(patient_id, family_comp_id)

                    # #############    BUILD THE GENOTYPE   #############

                    # the important things to pay attention to here are:
                    # karyotype = chr rearrangements  (somatic?)
                    # mutation = protein-level mutation as a label,
                    # often from omim
                    # gene = gene symbol - TODO get id
                    # variant_id = omim variant ids (; delimited)
                    # dbsnp_id = snp individual ids = full genotype?

                    # note GM00633 is a good example of chromosomal variation
                    # - do we have enough to capture this?
                    # GM00325 has both abnormal karyotype and variation

                    # make an assumption that if the taxon is blank,
                    # that it is human!
                    if species is None or species == '':
                        species = 'H**o sapiens'
                    taxon = self._map_species(species)

                    # if there's a dbSNP id,
                    # this is actually the individual's genotype
                    genotype_id = None
                    genotype_label = None
                    if dbsnp_id != '':
                        genotype_id = 'dbSNPIndividual:' + dbsnp_id.strip()

                    omim_map = {}
                    gvc_id = None

                    # some of the karyotypes are encoded
                    # with terrible hidden codes. remove them here
                    # i've seen a <98> character
                    karyotype = du.remove_control_characters(karyotype)
                    karyotype_id = None
                    if karyotype.strip() != '':
                        karyotype_id = \
                            '_:'+re.sub(
                                'MONARCH:', '', self.make_id(karyotype))
                        # add karyotype as karyotype_variation_complement
                        model.addIndividualToGraph(
                            karyotype_id, karyotype,
                            geno.genoparts['karyotype_variation_complement'])
                        # TODO break down the karyotype into parts
                        # and map into GENO. depends on #77

                        # place the karyotype in a location(s).
                        karyo_chrs = \
                            self._get_affected_chromosomes_from_karyotype(
                                karyotype)
                        for c in karyo_chrs:
                            chr_id = makeChromID(c, taxon, 'CHR')
                            # add an anonymous sequence feature,
                            # each located on chr
                            karyotype_feature_id = '-'.join((karyotype_id, c))
                            karyotype_feature_label = \
                                'some karyotype alteration on chr'+str(c)
                            f = Feature(g, karyotype_feature_id,
                                        karyotype_feature_label,
                                        geno.genoparts['sequence_alteration'])
                            f.addFeatureStartLocation(None, chr_id)
                            f.addFeatureToGraph()
                            geno.addParts(
                                karyotype_feature_id, karyotype_id,
                                geno.object_properties['has_alternate_part'])

                    if gene != '':
                        vl = gene + '(' + mutation + ')'

                    # fix the variant_id so it's always in the same order
                    vids = variant_id.split(';')
                    variant_id = ';'.join(sorted(list(set(vids))))

                    if karyotype.strip() != '' \
                            and not self._is_normal_karyotype(karyotype):
                        mutation = mutation.strip()
                        gvc_id = karyotype_id
                        if variant_id != '':
                            gvc_id = \
                                '_:' + variant_id.replace(';', '-') + '-' \
                                + re.sub(r'\w*:', '', karyotype_id)
                        if mutation.strip() != '':
                            gvc_label = '; '.join((vl, karyotype))
                        else:
                            gvc_label = karyotype
                    elif variant_id.strip() != '':
                        gvc_id = '_:' + variant_id.replace(';', '-')
                        gvc_label = vl
                    else:
                        # wildtype?
                        pass

                    # add the karyotype to the gvc.
                    # use reference if normal karyotype
                    karyo_rel = geno.object_properties['has_alternate_part']
                    if self._is_normal_karyotype(karyotype):
                        karyo_rel = \
                            geno.object_properties['has_reference_part']
                    if karyotype_id is not None \
                            and not self._is_normal_karyotype(karyotype) \
                            and gvc_id is not None and karyotype_id != gvc_id:
                        geno.addParts(karyotype_id, gvc_id, karyo_rel)

                    if variant_id.strip() != '':
                        # split the variants & add them as part of the genotype
                        # we don't necessarily know their zygosity,
                        # just that they are part of the genotype variant ids
                        # are from OMIM, so prefix as such we assume that the
                        # sequence alts will be defined in OMIM not here
                        # TODO sort the variant_id list, if the omim prefix is
                        # the same, then assume it's the locus make a hashmap
                        # of the omim id to variant id list;
                        # then build the genotype hashmap is also useful for
                        # removing the "genes" from the list of "phenotypes"

                        # will hold gene/locus id to variant list
                        omim_map = {}

                        locus_num = None
                        for v in variant_id.split(';'):
                            # handle omim-style and odd var ids
                            # like 610661.p.R401X
                            m = re.match(r'(\d+)\.+(.*)', v.strip())
                            if m is not None and len(m.groups()) == 2:
                                (locus_num, var_num) = m.groups()

                            if locus_num is not None \
                                    and locus_num not in omim_map:
                                omim_map[locus_num] = [var_num]
                            else:
                                omim_map[locus_num] += [var_num]

                        for o in omim_map:
                            # gene_id = 'OMIM:' + o  # TODO unused
                            vslc_id = \
                                '_:' + '-'.join(
                                    [o + '.' + a for a in omim_map.get(o)])
                            vslc_label = vl
                            # we don't really know the zygosity of
                            # the alleles at all.
                            # so the vslcs are just a pot of them
                            model.addIndividualToGraph(
                                vslc_id, vslc_label, geno.
                                genoparts['variant_single_locus_complement'])
                            for v in omim_map.get(o):
                                # this is actually a sequence alt
                                allele1_id = 'OMIM:' + o + '.' + v
                                geno.addSequenceAlteration(allele1_id, None)

                                # assume that the sa -> var_loc -> gene
                                # is taken care of in OMIM
                                geno.addPartsToVSLC(
                                    vslc_id, allele1_id, None,
                                    geno.zygosity['indeterminate'], geno.
                                    object_properties['has_alternate_part'])

                            if vslc_id != gvc_id:
                                geno.addVSLCtoParent(vslc_id, gvc_id)

                    if affected == 'unaffected':
                        # let's just say that this person is wildtype
                        model.addType(patient_id, geno.genoparts['wildtype'])
                    elif genotype_id is None:
                        # make an anonymous genotype id
                        genotype_id = '_:geno' + catalog_id.strip()

                    # add the gvc
                    if gvc_id is not None:
                        model.addIndividualToGraph(
                            gvc_id, gvc_label,
                            geno.genoparts['genomic_variation_complement'])

                        # add the gvc to the genotype
                        if genotype_id is not None:
                            if affected == 'unaffected':
                                rel = \
                                    geno.object_properties[
                                        'has_reference_part']
                            else:
                                rel = \
                                    geno.object_properties[
                                        'has_alternate_part']
                            geno.addParts(gvc_id, genotype_id, rel)
                        if karyotype_id is not None \
                                and self._is_normal_karyotype(karyotype):
                            if gvc_label is not None and gvc_label != '':
                                genotype_label = \
                                    '; '.join((gvc_label, karyotype))
                            else:
                                genotype_label = karyotype
                            if genotype_id is None:
                                genotype_id = karyotype_id
                            else:
                                geno.addParts(
                                    karyotype_id, genotype_id, geno.
                                    object_properties['has_reference_part'])
                        else:
                            genotype_label = gvc_label
                            # use the catalog id as the background
                        genotype_label += ' [' + catalog_id.strip() + ']'

                    if genotype_id is not None and gvc_id is not None:
                        # only add the genotype if it has some parts
                        geno.addGenotype(genotype_id, genotype_label,
                                         geno.genoparts['intrinsic_genotype'])
                        geno.addTaxon(taxon, genotype_id)
                        # add that the patient has the genotype
                        # TODO check if the genotype belongs to
                        # the cell line or to the patient
                        g.addTriple(patient_id,
                                    geno.properties['has_genotype'],
                                    genotype_id)
                    else:
                        geno.addTaxon(taxon, patient_id)

                    # TODO: Add sex/gender  (as part of the karyotype?)

                    # #############    DEAL WITH THE DISEASES   #############

                    # we associate the disease to the patient
                    if affected == 'affected':
                        if omim_number != '':
                            for d in omim_number.split(';'):
                                if d is not None and d != '':
                                    # if the omim number is in omim_map,
                                    # then it is a gene not a pheno
                                    if d not in omim_map:
                                        disease_id = 'OMIM:' + d.strip()
                                        # assume the label is taken care of
                                        model.addClassToGraph(disease_id, None)

                                        # add the association:
                                        #   the patient has the disease
                                        assoc = G2PAssoc(
                                            g, self.name, patient_id,
                                            disease_id)
                                        assoc.add_association_to_graph()

                                        # this line is a model of this disease
                                        # TODO abstract out model into
                                        # it's own association class?
                                        g.addTriple(
                                            cell_line_id, model.
                                            object_properties['model_of'],
                                            disease_id)
                                    else:
                                        logger.info(
                                            'removing %s from disease list ' +
                                            'since it is a gene', d)

                    # #############    ADD PUBLICATIONS   #############

                    if pubmed_ids != '':
                        for s in pubmed_ids.split(';'):
                            pubmed_id = 'PMID:' + s.strip()
                            ref = Reference(g, pubmed_id)
                            ref.setType(Reference.ref_types['journal_article'])
                            ref.addRefToGraph()
                            g.addTriple(pubmed_id,
                                        model.object_properties['mentions'],
                                        cell_line_id)

                    if not self.testMode \
                            and (limit is not None and line_counter > limit):
                        break
        return

Esempio n. 15

Mostra file

File: SGD.py Progetto: justaddcoffee/dipper

    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian??? phenotype
        """
        # prep record
        # remove description and mapp Experiment Type to apo term
        experiment_type = record['Experiment Type'].split('(')[0]
        experiment_type = experiment_type.split(',')
        record['experiment_type'] = list()
        for exp_type in experiment_type:
            exp_type = exp_type.lstrip().rstrip()
            record['experiment_type'].append({
                'id': self.apo_term_id[exp_type],
                'term': exp_type,
            })
        sgd_phenotype = record['Phenotype']
        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            'has_quality':
            False  # descriptive and don't bother looking for a quality
        }
        phenotype = record['Phenotype']
        if ':' in phenotype:
            pheno_obj['has_quality'] = True
            ent_qual = sgd_phenotype.split(': ')
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = self.globaltt['has phenotype']

        if record['pheno_obj']['has_quality']:
            pheno_label = '{0}:{1}'.format(
                record['pheno_obj']['entity']['term'],
                record['pheno_obj']['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                record['pheno_obj']['entity']['apo_id'].replace(':', '_'),
                record['pheno_obj']['quality']['apo_id'].replace(':', '_'))
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
        else:
            pheno_label = record['pheno_obj']['entity']['term']
            pheno_id = record['pheno_obj']['entity']['apo_id']
            g2p_assoc = Assoc(self.graph,
                              self.name,
                              sub=gene,
                              obj=pheno_id,
                              pred=relation)
            assoc_id = g2p_assoc.make_association_id('yeastgenome.org', gene,
                                                     relation, pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        model.addTriple(subject_id=pheno_id,
                        predicate_id=self.globaltt['subclass_of'],
                        obj=self.globaltt['phenotype'])

        # label nodes
        # pheno label

        model.addLabel(subject_id=pheno_id, label=pheno_label)

        g2p_assoc.description = self._make_description(record)

        # add the references
        references = record['Reference']
        references = references.replace(' ', '')
        references = references.split('|')

        #  created Ref prefix in curie map to route to proper reference URL in SGD
        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(self.graph, references[0],
                                  self.globaltt['publication'])
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
        return

Esempio n. 16

Mostra file

    def _process_straininfo(self, limit):

        src_key = 'straininfo'
        raw = '/'.join((self.rawdir, self.files[src_key]['file']))
        LOG.info('Processing measurementsfrom file: %s', raw)

        if self.test_mode:
            graph = self.testgraph
        else:
            graph = self.graph
        model = Model(graph)
        tax_id = self.globaltt['Mus musculus']
        col = self.files[src_key]['columns']

        with open(raw, 'r') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
            row = next(reader)
            if self.check_fileheader(col, row):
                pass
            for row in reader:
                if not row:
                    continue  # skip blank rows
                strain_name = row[col.index('strainname')]
                vendor = row[col.index('vendor')]
                stocknum = row[col.index('stocknum')]
                panel = row[col.index('panel')]
                mpd_strainid = str(row[col.index('mpd_strainid')])
                # straintype = row[col.index('straintype')]
                # n_proj = row[col.index('n_proj')]
                # n_snp_datasets = row[col.index('n_snp_datasets')]
                mpdshortname = row[col.index('mpd_shortname')].strip()
                url = row[col.index('url')]  # new?

                # C57BL/6J,J,000664,,7,IN,225,17,,http://jaxmice.jax.org/strain/000664.html
                # create the strain as an instance of the taxon
                if self.test_mode and 'MPD:' + mpd_strainid not in self.test_ids:
                    continue

                strain_id = 'MPD-strain:' + mpd_strainid
                model.addIndividualToGraph(strain_id, strain_name, tax_id)
                if mpdshortname != '':
                    model.addSynonym(strain_id, mpdshortname)

                self.idlabel_hash[strain_id] = strain_name
                # make it equivalent to the vendor+stock
                if stocknum != '':
                    if vendor == 'J':
                        jax_id = 'JAX:' + stocknum
                        model.addSameIndividual(strain_id, jax_id)
                    elif vendor == 'Rbrc':  # reiken
                        reiken_id = 'RBRC:' + stocknum
                        model.addSameIndividual(strain_id, reiken_id)
                    else:
                        if url != '':
                            model.addXref(strain_id, url, True)
                        if vendor != '':
                            model.addXref(
                                strain_id, ':'.join((vendor, stocknum)),
                                True
                            )

                # add the panel information
                if panel != '':
                    desc = panel + ' [panel]'
                    model.addDescription(strain_id, desc)

Esempio n. 17

Mostra file

File: SGD.py Progetto: DoctorBud/dipper

    def make_association(self, record):
        """
        contstruct the association
        :param record:
        :return: modeled association of  genotype to mammalian phenotype
        """
        # prep record
        # remove description and mapp Experiment Type to apo term
        experiment_type = record['Experiment Type'].split('(')[0]
        experiment_type = experiment_type.split(',')
        record['experiment_type'] = list()
        for exp_type in experiment_type:
            exp_type = exp_type.lstrip().rstrip()
            record['experiment_type'].append(
                {
                    'id': self.apo_term_id[exp_type],
                    'term': exp_type,
                })
        sgd_phenotype = record['Phenotype']
        pheno_obj = {
            'entity': {
                'term': None,
                'apo_id': None
            },
            'quality': {
                'term': None,
                'apo_id': None
            },
            'has_quality': False  # False = phenotype was descriptive and don't bother looking for a quality
        }
        phenotype = record['Phenotype']
        if ':' in phenotype:
            pheno_obj['has_quality'] = True
            ent_qual = sgd_phenotype.split(': ')
            entity = ent_qual[0]
            quality = ent_qual[1]
            pheno_obj['entity']['term'] = entity
            pheno_obj['entity']['apo_id'] = self.apo_term_id[entity]
            pheno_obj['quality']['term'] = quality
            pheno_obj['quality']['apo_id'] = self.apo_term_id[quality]
        else:
            pheno_obj['entity']['term'] = phenotype
            pheno_obj['entity']['apo_id'] = self.apo_term_id[phenotype]
        record['pheno_obj'] = pheno_obj

        # begin modeling
        model = Model(self.graph)

        # define the triple
        gene = 'SGD:{}'.format(record['SGDID'])
        relation = Model.object_properties['has_phenotype']  # has phenotype

        if record['pheno_obj']['has_quality']:
            pheno_label = '{0}:{1}'.format(
                record['pheno_obj']['entity']['term'],
                record['pheno_obj']['quality']['term'])
            pheno_id = 'MONARCH:{0}{1}'.format(
                record['pheno_obj']['entity']['apo_id'].replace(':', '_'),
                record['pheno_obj']['quality']['apo_id'].replace(':', '_')
            )
            g2p_assoc = Assoc(self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)
        else:
            pheno_label = record['pheno_obj']['entity']['term']
            pheno_id = record['pheno_obj']['entity']['apo_id']
            g2p_assoc = Assoc(self.graph, self.name, sub=gene, obj=pheno_id, pred=relation)
            assoc_id = g2p_assoc.make_association_id(definedby='yeastgenome.org', subject=gene, predicate=relation,
                                                     object=pheno_id)
            g2p_assoc.set_association_id(assoc_id=assoc_id)

        # add to graph to mint assoc id
        g2p_assoc.add_association_to_graph()

        model.addLabel(subject_id=gene, label=record['Gene Name'])

        # add the association triple
        model.addTriple(subject_id=gene, predicate_id=relation, obj=pheno_id)

        # make pheno subclass of UPHENO:0001001
        model.addTriple(subject_id=pheno_id, predicate_id=Model.object_properties['subclass_of'], obj='UPHENO:0001001')

        # label nodes
        # pheno label
        model.addLabel(subject_id=pheno_id, label=pheno_label)

        g2p_assoc.description = self._make_description(record)

        # add the references
        references = record['Reference']
        references = references.replace(' ', '')
        references = references.split('|')

        #  created RGDRef prefix in curie map to route to proper reference URL in RGD
        if len(references) > 0:
            # make first ref in list the source
            g2p_assoc.add_source(identifier=references[0])
            ref_model = Reference(
                self.graph, references[0],
                Reference.ref_types['publication']
            )
            ref_model.addRefToGraph()

        if len(references) > 1:
            # create equivalent source for any other refs in list
            for ref in references[1:]:
                model.addSameIndividual(sub=references[0], obj=ref)

        # add experiment type as evidence
        for exp_type in record['experiment_type']:
            g2p_assoc.add_evidence(exp_type['id'])
            model.addLabel(subject_id=exp_type['id'], label=exp_type['term'])

        try:
            g2p_assoc.add_association_to_graph()
        except Exception as e:
            print(e)
        return

Esempio n. 18

Mostra file

File: UCSCBands.py Progetto: DoctorBud/dipper

    def _create_genome_builds(self):
        """
        Various resources will map variations to either UCSC (hg*)
        or to NCBI assemblies. Here we create the equivalences between them.
        Data taken from:
        https://genome.ucsc.edu/FAQ/FAQreleases.html#release1

        :return:

        """

        # TODO add more species
        ucsc_assembly_id_map = {
            "9606": {
                "UCSC:hg38": "NCBIGenome:GRCh38",
                "UCSC:hg19": "NCBIGenome:GRCh37",
                "UCSC:hg18": "NCBIGenome:36.1",
                "UCSC:hg17": "NCBIGenome:35",
                "UCSC:hg16": "NCBIGenome:34",
                "UCSC:hg15": "NCBIGenome:33",
                },
            "7955": {
                "UCSC:danRer10": "NCBIGenome:GRCz10",
                "UCSC:danRer7":	"NCBIGenome:Zv9",
                "UCSC:danRer6": "NCBIGenome:Zv8",
                },
            "10090": {
                "UCSC:mm10": "NCBIGenome:GRCm38",
                "UCSC:mm9":	"NCBIGenome:37"
            },
            "9031": {
                "UCSC:galGal4": "NCBIAssembly:317958",
                },
            "9913": {
                "UCSC:bosTau7": "NCBIAssembly:GCF_000003205.5",
                },
            "9823": {
                "UCSC:susScr3": "NCBIAssembly:304498",
                },
            "9940": {
                "UCSC:oviAri3": "NCBIAssembly:GCF_000298735.1",
                },
            "9796": {
                "UCSC:equCab2": "NCBIAssembly:GCF_000002305.2",
                }
        }
        g = self.graph
        geno = Genotype(g)
        model = Model(g)
        logger.info("Adding equivalent assembly identifiers")
        for sp in ucsc_assembly_id_map:
            tax_num = sp
            tax_id = 'NCBITaxon:'+tax_num
            mappings = ucsc_assembly_id_map[sp]
            for i in mappings:
                ucsc_id = i
                ucsc_label = re.split(':', i)[1]
                mapped_id = mappings[i]
                mapped_label = re.split(':', mapped_id)[1]
                mapped_label = 'NCBI build '+str(mapped_label)
                geno.addReferenceGenome(ucsc_id, ucsc_label, tax_id)
                geno.addReferenceGenome(mapped_id, mapped_label, tax_id)
                model.addSameIndividual(ucsc_id, mapped_id)

        return

Esempio n. 19

Mostra file

File: OMIM.py Progetto: nicholsn/dipper

    def _get_process_allelic_variants(self, entry, graph):
        model = Model(graph)
        reference = Reference(graph)
        geno = Genotype(graph)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, graph)

            if 'allelicVariantList' in entry:
                for alv in entry['allelicVariantList']:
                    al_num = alv['allelicVariant']['number']
                    al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill(
                        4)
                    al_label = None
                    al_description = None
                    if alv['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in alv['allelicVariant']:
                            al_label = alv['allelicVariant']['mutations']
                        if 'text' in alv['allelicVariant']:
                            al_description = alv['allelicVariant']['text']
                            mch = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(mch)
                        geno.addAllele(al_id, al_label,
                                       self.globaltt['variant_locus'],
                                       al_description)
                        geno.addAlleleOfGene(al_id, 'OMIM:' + str(entry_num),
                                             self.globaltt['is_allele_of'])
                        for ref in publist[al_id]:
                            pmid = ref_to_pmid[int(ref)]
                            graph.addTriple(pmid, self.globaltt['is_about'],
                                            al_id)

                        # look up the pubmed id in the list of references
                        if 'dbSnps' in alv['allelicVariant']:
                            dbsnp_ids = re.split(
                                r',', alv['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:' + dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)

                        # Note that RCVs are variant to disease associations
                        # in ClinVar, rather than variant entries
                        # so we make these xrefs instead of equivalents
                        if 'clinvarAccessions' in alv['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                alv['allelicVariant']['clinvarAccessions'].split(';;;')
                            rcv_ids = [rcv[:12]
                                       for rcv in rcv_ids]  # incase more cruft

                            for rnum in rcv_ids:
                                rid = 'ClinVar:' + rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" + '#'.join(
                                (str(entry_num), str(al_num).zfill(4))))
                    elif re.search(r'moved', alv['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in alv['allelicVariant']:
                            moved_id = 'OMIM:' + alv['allelicVariant'][
                                'movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(
                            al_id,
                            moved_ids,
                            old_id_category=blv.terms['SequenceVariant'])
                    else:
                        LOG.error('Uncaught alleleic variant status %s',
                                  alv['allelicVariant']['status'])

Esempio n. 20

Mostra file

File: OMIM.py Progetto: kshefchek/dipper

    def _get_process_allelic_variants(self, entry, g):
        model = Model(g)
        reference = Reference(g)
        geno = Genotype(g)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:'+str(entry_num)+'.'+str(al_num).zfill(4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(
                            al_id, al_label, geno.genoparts['variant_locus'],
                            al_description)
                        geno.addAlleleOfGene(
                            al_id, 'OMIM:'+str(entry_num),
                            geno.object_properties[
                                'is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            g.addTriple(
                                pmid, model.object_properties['is_about'],
                                al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = \
                                re.split(r',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:'+dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                re.split(
                                    r';;;',
                                    al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [
                                (re.match(r'(RCV\d+);*', r)).group(1)
                                for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:'+rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" +
                            str(entry_num)+"#" + str(al_num).zfill(4))
                    elif re.search(
                            r'moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:'+al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s',
                                     al['allelicVariant']['status'])
                # end loop allelicVariantList

        return

Esempio n. 21

Mostra file

File: UDP.py Progetto: sgml/dipper

    def _parse_patient_variants(self, file):
        """
        :param file: file handler
        :return:
        """
        patient_var_map = self._convert_variant_file_to_dict(file)
        gene_coordinate_map = self._parse_gene_coordinates(
            self.map_files['gene_coord_map'])
        rs_map = self._parse_rs_map_file(self.map_files['dbsnp_map'])

        genotype = Genotype(self.graph)
        model = Model(self.graph)

        self._add_variant_gene_relationship(patient_var_map,
                                            gene_coordinate_map)

        for patient in patient_var_map:
            patient_curie = ':{0}'.format(patient)
            # make intrinsic genotype for each patient
            intrinsic_geno_bnode = self.make_id(
                "{0}-intrinsic-genotype".format(patient), "_")
            genotype_label = "{0} genotype".format(patient)
            genotype.addGenotype(intrinsic_geno_bnode, genotype_label,
                                 model.globaltt['intrinsic_genotype'])

            self.graph.addTriple(patient_curie, model.globaltt['has_genotype'],
                                 intrinsic_geno_bnode)
            for variant_id, variant in patient_var_map[patient].items():
                build = variant['build']
                chromosome = variant['chromosome']
                position = variant['position']
                reference_allele = variant['reference_allele']
                variant_allele = variant['variant_allele']
                genes_of_interest = variant['genes_of_interest']
                rs_id = variant['rs_id']

                variant_label = ''
                variant_bnode = self.make_id("{0}".format(variant_id), "_")

                # maybe should have these look like the elif statements below
                if position and reference_allele and variant_allele:
                    variant_label = self._build_variant_label(
                        build, chromosome, position, reference_allele,
                        variant_allele, genes_of_interest)
                elif not position and reference_allele and variant_allele \
                        and len(genes_of_interest) == 1:

                    variant_label = self._build_variant_label(
                        build, chromosome, position, reference_allele,
                        variant_allele, genes_of_interest)
                elif position and (not reference_allele or not variant_allele) \
                        and len(genes_of_interest) == 1:

                    variant_label = "{0}{1}({2}):g.{3}".format(
                        build, chromosome, genes_of_interest[0], position)
                elif len(genes_of_interest) == 1:
                    variant_label = 'variant of interest in {0} gene of patient' \
                        ' {1}'.format(genes_of_interest[0], patient)
                else:
                    variant_label = 'variant of interest in patient {0}'.format(
                        patient)

                genotype.addSequenceAlteration(variant_bnode, None)
                # check if it we have built the label
                # in _add_variant_gene_relationship()
                labels = self.graph.objects(
                    BNode(re.sub(r'^_:', '', variant_bnode, 1)), RDFS['label'])

                label_list = list(labels)

                if len(label_list) == 0:
                    model.addLabel(variant_bnode, variant_label)

                self.graph.addTriple(variant_bnode, self.globaltt['in taxon'],
                                     self.globaltt['H**o sapiens'])
                self.graph.addTriple(intrinsic_geno_bnode,
                                     self.globaltt['has_variant_part'],
                                     variant_bnode)
                if rs_id:
                    dbsnp_curie = 'dbSNP:{0}'.format(rs_id)
                    model.addSameIndividual(variant_bnode, dbsnp_curie)

        self._add_variant_sameas_relationships(patient_var_map, rs_map)
        return

Esempio n. 22

Mostra file

File: ClinVar.py Progetto: DoctorBud/dipper

    def _get_variants(self, limit):
        """
        Currently loops through the variant_summary file.

        :param limit:
        :return:

        """

        if self.testMode:
            g = self.testgraph
        else:
            g = self.graph

        model = Model(g)

        geno = Genotype(g)
        f = Feature(g, None, None, None)

        # add the taxon and the genome
        tax_num = '9606'  # HARDCODE
        tax_id = 'NCBITaxon:'+tax_num
        tax_label = 'Human'
        model.addClassToGraph(tax_id, None)
        geno.addGenome(tax_id, tax_label)  # label gets added elsewhere

        # not unzipping the file
        logger.info("Processing Variant records")
        line_counter = 0
        myfile = '/'.join((self.rawdir, self.files['variant_summary']['file']))
        with gzip.open(myfile, 'rb') as f:
            for line in f:
                # skip comments
                line = line.decode().strip()
                if re.match(r'^#', line):
                    continue

                # AlleleID               integer value as stored in the AlleleID field in ClinVar  (//Measure/@ID in the XML)
                # Type                   character, the type of variation
                # Name                   character, the preferred name for the variation
                # GeneID                 integer, GeneID in NCBI's Gene database
                # GeneSymbol             character, comma-separated list of GeneIDs overlapping the variation
                # ClinicalSignificance   character, comma-separated list of values of clinical significance reported for this variation
                #                          for the mapping between the terms listed here and the integers in the .VCF files, see
                #                          http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/
                # RS# (dbSNP)            integer, rs# in dbSNP
                # nsv (dbVar)            character, the NSV identifier for the region in dbVar
                # RCVaccession           character, list of RCV accessions that report this variant
                # TestedInGTR            character, Y/N for Yes/No if there is a test registered as specific to this variation in the NIH Genetic Testing Registry (GTR)
                # PhenotypeIDs           character, list of db names and identifiers for phenotype(s) reported for this variant
                # Origin                 character, list of all allelic origins for this variation
                # Assembly               character, name of the assembly on which locations are based
                # Chromosome             character, chromosomal location
                # Start                  integer, starting location, in pter->qter orientation
                # Stop                   integer, end location, in pter->qter orientation
                # Cytogenetic            character, ISCN band
                # ReviewStatus           character, highest review status for reporting this measure. For the key to the terms,
                #                            and their relationship to the star graphics ClinVar displays on its web pages,
                #                            see http://www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#interpretation
                # HGVS(c.)               character, RefSeq cDNA-based HGVS expression
                # HGVS(p.)               character, RefSeq protein-based HGVS expression
                # NumberSubmitters       integer, number of submissions with this variant
                # LastEvaluated          datetime, the latest time any submitter reported clinical significance
                # Guidelines             character, ACMG only right now, for the reporting of incidental variation in a Gene
                #                                (NOTE: if ACMG, not a specific to the allele but to the Gene)
                # OtherIDs               character, list of other identifiers or sources of information about this variant
                # VariantID              integer, the value used to build the URL for the current default report,
                #                            e.g. http://www.ncbi.nlm.nih.gov/clinvar/variation/1756/
                #

                # a crude check that there's an expected number of cols.
                # if not, error out because something changed.
                num_cols = len(line.split('\t'))
                expected_numcols = 29
                if num_cols != expected_numcols:
                    logger.error(
                        "Unexpected number of columns in raw file " +
                        "(%d actual vs %d expected)",
                        num_cols, expected_numcols)

                (allele_num, allele_type, allele_name, gene_num, gene_symbol,
                 clinical_significance, dbsnp_num, dbvar_num, rcv_nums,
                 tested_in_gtr, phenotype_ids, origin, assembly, chr, start,
                 stop, cytogenetic_loc, review_status, hgvs_c, hgvs_p,
                 number_of_submitters, last_eval, guidelines, other_ids,
                 variant_num, reference_allele, alternate_allele, categories,
                 ChromosomeAccession) = line.split('\t')

                # ###set filter=None in init if you don't want to have a filter
                # if self.filter is not None:
                #    if ((self.filter == 'taxids' and\
                #            (int(tax_num) not in self.tax_ids)) or\
                #            (self.filter == 'geneids' and\
                #             (int(gene_num) not in self.gene_ids))):
                #        continue
                # #### end filter

                line_counter += 1

                pheno_list = []
                if phenotype_ids != '-':
                    # trim any leading/trailing semicolons/commas
                    phenotype_ids = re.sub(r'^[;,]', '', phenotype_ids)
                    phenotype_ids = re.sub(r'[;,]$', '', phenotype_ids)
                    pheno_list = re.split(r'[,;]', phenotype_ids)

                if self.testMode:
                    # get intersection of test disease ids
                    # and these phenotype_ids
                    intersect = \
                        list(
                            set([str(i)
                                for i in self.disease_ids]) & set(pheno_list))
                    if int(gene_num) not in self.gene_ids and\
                            int(variant_num) not in self.variant_ids and\
                            len(intersect) < 1:
                        continue

                # TODO may need to switch on assembly to create correct
                # assembly/build identifiers
                build_id = ':'.join(('NCBIGenome', assembly))

                # make the reference genome build
                geno.addReferenceGenome(build_id, assembly, tax_id)

                allele_type_id = self._map_type_of_allele(allele_type)
                bandinbuild_id = None
                if str(chr) == '':
                    # check cytogenic location
                    if str(cytogenetic_loc).strip() != '':
                        # use cytogenic location to get the apx location
                        # oddly, they still put an assembly number even when
                        # there's no numeric location
                        if not re.search(r'-', str(cytogenetic_loc)):
                            band_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                tax_num, 'CHR')
                            geno.addChromosomeInstance(
                                cytogenetic_loc, build_id, assembly, band_id)
                            bandinbuild_id = makeChromID(
                                re.split(r'-', str(cytogenetic_loc)),
                                assembly, 'MONARCH')
                        else:
                            # can't deal with ranges yet
                            pass
                else:
                    # add the human chromosome class to the graph,
                    # and add the build-specific version of it
                    chr_id = makeChromID(str(chr), tax_num, 'CHR')
                    geno.addChromosomeClass(str(chr), tax_id, tax_label)
                    geno.addChromosomeInstance(
                        str(chr), build_id, assembly, chr_id)
                    chrinbuild_id = makeChromID(str(chr), assembly, 'MONARCH')

                seqalt_id = ':'.join(('ClinVarVariant', variant_num))
                gene_id = None

                # they use -1 to indicate unknown gene
                if str(gene_num) != '-1' and str(gene_num) != 'more than 10':
                    if re.match(r'^Gene:', gene_num):
                        gene_num = "NCBI" + gene_num
                    else:
                        gene_id = ':'.join(('NCBIGene', str(gene_num)))

                # FIXME there are some "variants" that are actually haplotypes
                # probably will get taken care of when we switch to processing
                # the xml for example, variant_num = 38562
                # but there's no way to tell if it's a haplotype
                # in the csv data so the dbsnp or dbvar
                # should probably be primary,
                # and the variant num be the vslc,
                # with each of the dbsnps being added to it

                # TODO clinical significance needs to be mapped to
                # a list of terms
                # first, make the variant:
                f = Feature(seqalt_id, allele_name, allele_type_id)

                if start != '-' and start.strip() != '':
                    f.addFeatureStartLocation(start, chrinbuild_id)
                if stop != '-' and stop.strip() != '':
                    f.addFeatureEndLocation(stop, chrinbuild_id)

                f.addFeatureToGraph()
                f.addTaxonToFeature(tax_id)
                # make the ClinVarVariant the clique leader
                model.makeLeader(seqalt_id)

                if bandinbuild_id is not None:
                    f.addSubsequenceOfFeature(bandinbuild_id)

                # CHECK - this makes the assumption that there is
                # only one affected chromosome per variant what happens with
                # chromosomal rearrangement variants?
                # shouldn't both chromosomes be here?

                # add the hgvs as synonyms
                if hgvs_c != '-' and hgvs_c.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_c)
                if hgvs_p != '-' and hgvs_p.strip() != '':
                    model.addSynonym(seqalt_id, hgvs_p)

                # add the dbsnp and dbvar ids as equivalent
                if dbsnp_num != '-' and int(dbsnp_num) != -1:
                    dbsnp_id = 'dbSNP:rs'+str(dbsnp_num)
                    model.addIndividualToGraph(dbsnp_id, None)
                    model.addSameIndividual(seqalt_id, dbsnp_id)
                if dbvar_num != '-':
                    dbvar_id = 'dbVar:'+dbvar_num
                    model.addIndividualToGraph(dbvar_id, None)
                    model.addSameIndividual(seqalt_id, dbvar_id)

                # TODO - not sure if this is right... add as xref?
                # the rcv is like the combo of the phenotype with the variant
                if rcv_nums != '-':
                    for rcv_num in re.split(r';', rcv_nums):
                        rcv_id = 'ClinVar:' + rcv_num
                        model.addIndividualToGraph(rcv_id, None)
                        model.addXref(seqalt_id, rcv_id)

                if gene_id is not None:
                    # add the gene
                    model.addClassToGraph(gene_id, gene_symbol)
                    # make a variant locus
                    vl_id = '_'+gene_num+'-'+variant_num
                    if self.nobnodes:
                        vl_id = ':'+vl_id
                    vl_label = allele_name
                    model.addIndividualToGraph(
                        vl_id, vl_label, geno.genoparts['variant_locus'])
                    geno.addSequenceAlterationToVariantLocus(seqalt_id, vl_id)
                    geno.addAlleleOfGene(vl_id, gene_id)
                else:
                    # some basic reporting
                    gmatch = re.search(r'\(\w+\)', allele_name)
                    if gmatch is not None and len(gmatch.groups()) > 0:
                        logger.info(
                            "Gene found in allele label, but no id provided: %s",
                            gmatch.group(1))
                    elif re.match(r'more than 10', gene_symbol):
                        logger.info(
                            "More than 10 genes found; "
                            "need to process XML to fetch (variant=%d)",
                            int(variant_num))
                    else:
                        logger.info(
                            "No gene listed for variant %d",
                            int(variant_num))

                # parse the list of "phenotypes" which are diseases.
                # add them as an association
                # ;GeneReviews:NBK1440,MedGen:C0392514,OMIM:235200,SNOMED CT:35400008;MedGen:C3280096,OMIM:614193;MedGen:CN034317,OMIM:612635;MedGen:CN169374
                # the list is both semicolon delimited and comma delimited,
                # but i don't know why! some are bad, like:
                # Orphanet:ORPHA ORPHA319705,SNOMED CT:49049000
                if phenotype_ids != '-':
                    for phenotype in pheno_list:
                        m = re.match(
                            r"(Orphanet:ORPHA(?:\s*ORPHA)?)", phenotype)
                        if m is not None and len(m.groups()) > 0:
                            phenotype = re.sub(
                                m.group(1), 'Orphanet:', phenotype.strip())
                        elif re.match(r'ORPHA:\d+', phenotype):
                            phenotype = re.sub(
                                r'^ORPHA', 'Orphanet', phenotype.strip())
                        elif re.match(r'Human Phenotype Ontology', phenotype):
                            phenotype = re.sub(
                                r'^Human Phenotype Ontology', '',
                                phenotype.strip())
                        elif re.match(r'SNOMED CT:\s?', phenotype):
                            phenotype = re.sub(
                                r'SNOMED CT:\s?', 'SNOMED:', phenotype.strip())
                        elif re.match(r'^Gene:', phenotype):
                            continue

                        assoc = G2PAssoc(
                            g, self.name, seqalt_id, phenotype.strip())
                        assoc.add_association_to_graph()

                if other_ids != '-':
                    id_list = other_ids.split(',')
                    # process the "other ids" ex:
                    # CFTR2:F508del,HGMD:CD890142,OMIM Allelic Variant:602421.0001
                    # TODO make more xrefs
                    for xrefid in id_list:
                        prefix = xrefid.split(':')[0].strip()
                        if prefix == 'OMIM Allelic Variant':
                            xrefid = 'OMIM:'+xrefid.split(':')[1]
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'HGMD':
                            model.addIndividualToGraph(xrefid, None)
                            model.addSameIndividual(seqalt_id, xrefid)
                        elif prefix == 'dbVar' \
                                and dbvar_num == xrefid.split(':')[1].strip():
                            pass  # skip over this one
                        elif re.search(r'\s', prefix):
                            pass
                            # logger.debug(
                            #   'xref prefix has a space: %s', xrefid)
                        else:
                            # should be a good clean prefix
                            # note that HGMD variants are in here as Xrefs
                            # because we can't resolve URIs for them
                            # logger.info("Adding xref: %s", xrefid)
                            # gu.addXref(g, seqalt_id, xrefid)
                            # logger.info("xref prefix to add: %s", xrefid)
                            pass

                if not self.testMode and limit is not None \
                        and line_counter > limit:
                    break

        logger.info("Finished parsing variants")

        return

Esempio n. 23

Mostra file

File: OMIM.py Progetto: putmantime/dipper

    def _get_process_allelic_variants(self, entry, g):
        model = Model(g)
        reference = Reference(g)
        geno = Genotype(g)
        if entry is not None:
            # to hold the entry-specific publication mentions
            # for the allelic variants
            publist = {}
            entry_num = entry['mimNumber']

            # process the ref list just to get the pmids
            ref_to_pmid = self._get_pubs(entry, g)

            if 'allelicVariantList' in entry:
                allelicVariantList = entry['allelicVariantList']
                for al in allelicVariantList:
                    al_num = al['allelicVariant']['number']
                    al_id = 'OMIM:' + str(entry_num) + '.' + str(al_num).zfill(
                        4)
                    al_label = None
                    al_description = None
                    if al['allelicVariant']['status'] == 'live':
                        publist[al_id] = set()
                        if 'mutations' in al['allelicVariant']:
                            al_label = al['allelicVariant']['mutations']
                        if 'text' in al['allelicVariant']:
                            al_description = al['allelicVariant']['text']
                            m = re.findall(r'\{(\d+)\:', al_description)
                            publist[al_id] = set(m)
                        geno.addAllele(al_id, al_label,
                                       geno.genoparts['variant_locus'],
                                       al_description)
                        geno.addAlleleOfGene(
                            al_id, 'OMIM:' + str(entry_num),
                            geno.object_properties[
                                'is_sequence_variant_instance_of'])
                        for r in publist[al_id]:
                            pmid = ref_to_pmid[int(r)]
                            g.addTriple(pmid,
                                        model.object_properties['is_about'],
                                        al_id)
                        # look up the pubmed id in the list of references
                        if 'dbSnps' in al['allelicVariant']:
                            dbsnp_ids = \
                                re.split(r',', al['allelicVariant']['dbSnps'])
                            for dnum in dbsnp_ids:
                                did = 'dbSNP:' + dnum.strip()
                                model.addIndividualToGraph(did, None)
                                model.addSameIndividual(al_id, did)
                        if 'clinvarAccessions' in al['allelicVariant']:
                            # clinvarAccessions triple semicolon delimited
                            # each >1 like RCV000020059;;;
                            rcv_ids = \
                                re.split(
                                    r';;;',
                                    al['allelicVariant']['clinvarAccessions'])
                            rcv_ids = [(re.match(r'(RCV\d+);*', r)).group(1)
                                       for r in rcv_ids]
                            for rnum in rcv_ids:
                                rid = 'ClinVar:' + rnum
                                model.addXref(al_id, rid)
                        reference.addPage(
                            al_id, "http://omim.org/entry/" + str(entry_num) +
                            "#" + str(al_num).zfill(4))
                    elif re.search(r'moved', al['allelicVariant']['status']):
                        # for both 'moved' and 'removed'
                        moved_ids = None
                        if 'movedTo' in al['allelicVariant']:
                            moved_id = 'OMIM:' + al['allelicVariant']['movedTo']
                            moved_ids = [moved_id]
                        model.addDeprecatedIndividual(al_id, moved_ids)
                    else:
                        logger.error('Uncaught alleleic variant status %s',
                                     al['allelicVariant']['status'])
                # end loop allelicVariantList

        return