コード例 #1
0
    def homologene_uniprot_dict(self, source):
        """
        Build the UniProt-to-UniProt orthology table from NCBI HomoloGene.

        Every UniProt ID in the proteome loaded for ``source`` is mapped to
        Entrez and RefSeq protein IDs; these are translated to the target
        organism with the HomoloGene dictionaries and the hits are mapped
        back to UniProt IDs of ``self.target``. The result is stored in
        ``self.homo[source]`` as a dict from source UniProt ID to a sorted
        list of target UniProt IDs (the list is empty when nothing maps).

        :param source:
            Source organism; normalized with ``self.get_source`` before use.
        """

        source = self.get_source(source)

        self.homo[source] = {}

        hge = dataio.homologene_dict(source, self.target, 'entrez')
        hgr = dataio.homologene_dict(source, self.target, 'refseq')

        self.load_proteome(source, self.only_swissprot)

        for u in self._proteomes[(source, self.only_swissprot)]:

            source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
            source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
            target_u = set()
            target_r = set()
            target_e = set()

            # source IDs -> orthologous target IDs via HomoloGene
            for e in source_e:
                if e in hge:
                    target_e.update(hge[e])

            for r in source_r:
                if r in hgr:
                    target_r.update(hgr[r])

            # target Entrez / RefSeq IDs -> target UniProt IDs
            for e in target_e:
                target_u.update(
                    mapping.map_name(e, 'entrez', 'uniprot', self.target))

            for r in target_r:
                # the RefSeq ID `r` must be translated here, not the
                # leftover Entrez loop variable
                target_u.update(
                    mapping.map_name(r, 'refseqp', 'uniprot', self.target))

            # pass every hit through the uniprot -> uniprot mapping once
            # more; duplicates produced by this step are kept
            remapped = itertools.chain.from_iterable(
                mapping.map_name(tu, 'uniprot', 'uniprot', self.target)
                for tu in target_u
            )

            self.homo[source][u] = sorted(remapped)
コード例 #2
0
    def build_gene(self):
        """
        Fill ``self.gene`` with ``CellPhoneDBGene`` records.

        For each entity in ``self._entities`` (or each of its components,
        when the entity has a ``components`` attribute) one record is
        created per Ensembl ID mapped from the UniProt ID. The gene symbol
        is used both as ``gene_name`` and as ``hgnc_symbol``.
        """

        self.gene = set()

        for entity in self._entities:

            # Components of complexes are added as well; this may not be
            # strictly necessary but it does no harm.
            members = (
                entity.components
                if hasattr(entity, 'components') else
                (entity, )
            )

            for member in members:

                symbol = mapping.map_name0(member, 'uniprot', 'genesymbol')

                self.gene.update(
                    CellPhoneDBGene(
                        gene_name=symbol,
                        uniprot=member,
                        hgnc_symbol=symbol,
                        ensembl=ensembl_id,
                    )
                    for ensembl_id in
                    mapping.map_name(member, 'uniprot', 'ensembl')
                )
コード例 #3
0
    def _process(self, p):

        # human leukocyte antigenes result a result an
        # extremely high number of combinations
        if (not p['kinase'] or (isinstance(p['substrate'], common.basestring)
                                and p['substrate'].startswith('HLA'))):

            return

        if not isinstance(p['kinase'], list):
            p['kinase'] = [p['kinase']]

        kinase_ups = mapping.map_names(
            p['kinase'],
            self.enzyme_id_type,
            'uniprot',
            ncbi_tax_id=self.ncbi_tax_id,
        )

        substrate_ups_all = set([])

        for sub_id_type in (self.substrate_id_types[self.input_method.lower()]
                            if self.input_is(self.substrate_id_types,
                                             '__contains__') else
                            [self.substrate_id_type]):

            if type(sub_id_type) is tuple:
                sub_id_type, sub_id_attr = sub_id_type
            else:
                sub_id_attr = 'substrate'

            substrate_ups_all.update(
                set(
                    mapping.map_name(
                        p[sub_id_attr],
                        sub_id_type,
                        'uniprot',
                        self.ncbi_tax_id,
                    )))

        # looking up sequences in all isoforms:
        substrate_ups = []

        for s in substrate_ups_all:

            if 'substrate_isoform' in p and p['substrate_isoform']:

                substrate_ups.append((s, p['substrate_isoform']))

            else:

                se = self.get_seq(s)

                if se is None:
                    continue

                for isof in se.isoforms():

                    if p['instance'] is not None:

                        if se.match(p['instance'],
                                    p['start'],
                                    p['end'],
                                    isoform=isof):

                            substrate_ups.append((s, isof))

                    else:

                        if se.match(p['resaa'], p['resnum'], isoform=isof):

                            substrate_ups.append((s, isof))

        if self.trace:

            if p['substrate'] not in self.sub_ambig:

                self.sub_ambig[p['substrate']] = substrate_ups

            for k in p['kinase']:

                if k not in self.kin_ambig:

                    self.kin_ambig[k] = kinase_ups
            # generating report on non matching substrates
            if len(substrate_ups) == 0:

                for s in substrate_ups_all:

                    se = self.get_seq(s[0])

                    if se is None:
                        continue

                    nomatch.append(
                        (s[0], s[1], ((p['substrate_refseq']
                                       if 'substrate_refseq' in p else ''), s,
                                      p['instance'],
                                      se.get(p['start'], p['end']))))

        # adding kinase-substrate interactions

        for k in kinase_ups:

            for s in substrate_ups:

                if (not self.allow_mixed_organisms
                        and (self.get_taxon(k) != self.ncbi_tax_id
                             or self.get_taxon(s[0]) != self.ncbi_tax_id)):
                    continue

                se = self.get_seq(s[0])

                if se is None:
                    continue

                res = intera.Residue(p['resnum'],
                                     p['resaa'],
                                     s[0],
                                     isoform=s[1])

                if p['instance'] is None:

                    reg = se.get_region(p['resnum'],
                                        p['start'],
                                        p['end'],
                                        isoform=s[1])

                    if reg is not None:

                        p['instance'] = reg[2]
                        p['start'] = reg[0]
                        p['end'] = reg[1]

                if 'typ' not in p:
                    p['typ'] = 'phosphorylation'

                mot = intera.Motif(s[0],
                                   p['start'],
                                   p['end'],
                                   instance=p['instance'],
                                   isoform=s[1])

                ptm = intera.Ptm(s[0],
                                 motif=mot,
                                 residue=res,
                                 typ=p['typ'],
                                 source=[self.name],
                                 isoform=s[1])

                dom = intera.Domain(protein=k)

                if 'references' not in p:
                    p['references'] = []

                dommot = intera.DomainMotif(domain=dom,
                                            ptm=ptm,
                                            sources=[self.name],
                                            refs=p['references'])

                if self.input_is('mimp'):
                    dommot.mimp_sources = ';'.split(p['databases'])
                    dommot.npmid = p['npmid']

                elif self.input_is('phosphonetworks'):
                    dommot.pnetw_score = p['score']

                elif self.input_is('dbptm'):
                    dommot.dbptm_sources = [p['source']]

                yield dommot