Example #1
0
def li2012_dmi():
    """
    Converts table read by ``pypath.inputs.li2012.get_li2012`` to
    list of ``pypath.internals.intera.DomainMotif`` objects.
    Translates GeneSymbols to UniProt IDs.

    For each row, the substrate (``l[1]``, formatted as
    ``GeneSymbol/site``) and the kinase (``l[2]``) are mapped from
    GeneSymbol to UniProt. For every substrate UniProt the first isoform
    carrying a tyrosine at the given position is selected, and one
    ``DomainMotif`` is created for each kinase UniProt.

    :return:
        List of ``intera.DomainMotif`` objects.
    """

    result = []
    nondigit = re.compile(r'[^\d]+')
    se = uniprot_input.swissprot_seq(isoforms=True)
    data = get_li2012()

    for l in data:

        subs_field = l[1].split('/')
        subs_protein = subs_field[0]
        tk_protein = l[2].split()[0]
        subs_uniprots = mapping.map_name(
            subs_protein,
            'genesymbol',
            'uniprot',
        )
        tk_uniprots = mapping.map_name(tk_protein, 'genesymbol', 'uniprot')
        # keep only the digits of the site label to obtain the
        # residue number
        subs_resnum = int(nondigit.sub('', subs_field[1]))

        for su in subs_uniprots:

            if su not in se:
                continue

            seq = se[su]

            # first isoform having a tyrosine at the reported position
            subs_iso = None
            for iso, _ in iteritems(seq.isof):
                if seq.get(subs_resnum, isoform=iso) == 'Y':
                    subs_iso = iso
                    break

            if not subs_iso:
                continue

            # +/-7 residue window around the site, clamped to the
            # sequence boundaries
            # (assumes ``isof[iso]`` has the length of the isoform
            # sequence -- TODO confirm)
            start = max(1, subs_resnum - 7)
            end = min(subs_resnum + 7, len(seq.isof[subs_iso]))
            instance = seq.get(start, end, isoform=subs_iso)

            for ku in tk_uniprots:

                res = intera.Residue(subs_resnum,
                                     'Y',
                                     su,
                                     isoform=subs_iso)
                mot = intera.Motif(su,
                                   start,
                                   end,
                                   isoform=subs_iso,
                                   instance=instance)
                ptm = intera.Ptm(su,
                                 motif=mot,
                                 residue=res,
                                 isoform=subs_iso,
                                 source='Li2012')
                dom = intera.Domain(ku)
                dommot = intera.DomainMotif(domain=dom,
                                            ptm=ptm,
                                            sources=['Li2012'])
                result.append(dommot)

    return result
Example #2
0
def phosphosite_ptms(organism='human'):
    """
    Retrieves the PhosphoSitePlus modification site table and converts
    its rows to ``intera.Ptm`` objects.

    :param organism:
        Only rows with this value in their organism column (index 6)
        are processed; ``None`` disables the filtering.

    :return:
        List of ``intera.Ptm`` objects, one for each accepted row.
    """

    strip_nondigit = re.compile(r'[^\d]+')
    flank_pattern = re.compile(r'(_*)([A-Za-z]+)(_*)')
    url = urls.urls['psite_p']['url']
    ptms = []

    c = curl.Curl(url, silent=False, large=True)
    data = c.result

    # the first 4 lines are discarded before reading the records
    for _ in xrange(4):
        c.result.readline()

    for line in data:

        fields = line.split('\t')

        if len(fields) <= 9:
            continue

        if organism is not None and fields[6] != organism:
            continue

        # accession with optional isoform suffix, e.g. ``P12345-2``
        accession = fields[2]

        if '-' in accession:
            id_parts = accession.split('-')
            isoform = int(id_parts[1])
            uniprot = id_parts[0]
        else:
            isoform = 1
            uniprot = accession

        site = fields[4]
        typ = fields[3].lower()

        if not typ:
            # fall back to the suffix of the site label
            typ = site.split('-')[1] if '-' in site else None

        aa = site[0]
        num = int(strip_nondigit.sub('', site))

        flanks = flank_pattern.match(fields[9])

        if flanks is None:
            start = end = instance = None
        else:
            # underscores pad the window where it reaches over the
            # sequence ends; the boundaries are shifted accordingly
            leading, _letters, trailing = flanks.groups()
            start = num - 7 + len(leading)
            end = num + 7 - len(trailing)
            instance = fields[9].replace('_', '').upper()

        residue = intera.Residue(num, aa, uniprot, isoform=isoform)
        region = intera.Motif(uniprot,
                              start,
                              end,
                              instance=instance,
                              isoform=isoform)
        ptms.append(
            intera.Ptm(uniprot,
                       typ=typ,
                       motif=region,
                       residue=residue,
                       source='PhosphoSite',
                       isoform=isoform))

    return ptms
Example #3
0
    def translate_ptm(self, ptm):
        """
        Translates a PTM to the target organism.

        The site of ``ptm`` is passed to ``self.translate_site`` and a new
        ``intera.Ptm`` is built for each translated site returned. The
        evidences of the input ``ptm`` are carried over to every
        translated object.

        :param ptm:
            Object providing the ``protein``, ``residue`` (with ``name``,
            ``number`` and ``isoform``), ``typ`` and ``evidences``
            attributes.

        :return:
            List of ``intera.Ptm`` objects; empty if no site has been
            translated. If ``self.strict`` is set, sites without an
            available sequence or isoform are omitted.
        """

        tptms = self.translate_site(
            ptm.protein,
            ptm.residue.name,
            ptm.residue.number,
            ptm.residue.isoform,
            ptm.typ,
        )

        result = []

        for x in tptms:

            # elements of ``x`` as used below:
            # 0: protein, 1: isoform, 2: residue name,
            # 3: residue number, 5: modification type
            se = self.get_seq(x[0])
            has_isoform = se is not None and x[1] in se.isof

            if not has_isoform and self.strict:
                continue

            res = intera.Residue(
                number=x[3],
                name=x[2],
                protein=x[0],
                isoform=x[1],
                ncbi_tax_id=self.target,
            )

            # without a sequence the motif is created with empty
            # boundaries and instance
            if has_isoform:
                start, end, region = se.get_region(x[3], isoform=x[1])
            else:
                start, end, region = None, None, None

            mot = intera.Motif(
                protein=x[0],
                start=start,
                end=end,
                instance=region,
                isoform=x[1],
                ncbi_tax_id=self.target,
            )

            # a distinct name is used here to avoid rebinding the ``ptm``
            # argument: the evidences must come from the input object
            # at each iteration, not from the object created in the
            # previous one
            translated = intera.Ptm(
                protein=x[0],
                motif=mot,
                residue=res,
                typ=x[5],
                isoform=x[1],
                evidences=ptm.evidences,
                ncbi_tax_id=self.target,
            )

            result.append(translated)

        return result
Example #4
0
    def _process(self, p):
        """
        Processes one enzyme-substrate record and yields the
        ``intera.DomainMotif`` objects built from it.

        The enzyme and substrate identifiers are translated to UniProt,
        the substrate isoforms matching the site are looked up, and one
        object is yielded for each enzyme and substrate isoform pair.

        :param p:
            Dict-like record. The keys ``kinase``, ``substrate``,
            ``resaa`` and ``resnum`` are read unconditionally; the keys
            ``substrate_isoform``, ``instance``, ``start``, ``end``,
            ``typ``, ``databases``, ``references`` and
            ``substrate_refseq`` are used if present. The record is
            modified in place: ``kinase`` is converted to list, ``typ``
            defaults to ``'phosphorylation'`` and ``start``, ``end``,
            ``instance`` may be filled in from the sequence.

        :return:
            Generator of ``intera.DomainMotif`` objects. Nothing is
            yielded for records without enzyme or with a substrate name
            starting with ``HLA``.
        """

        # human leukocyte antigens result in an
        # extremely high number of combinations
        if (not p['kinase'] or (isinstance(p['substrate'], common.basestring)
                                and p['substrate'].startswith('HLA'))):

            return

        if not isinstance(p['kinase'], list):
            p['kinase'] = [p['kinase']]

        kinase_ups = mapping.map_names(
            p['kinase'],
            self.id_type_enzyme,
            'uniprot',
            ncbi_tax_id=self.ncbi_tax_id,
        )

        substrate_ups_all = set()

        for sub_id_type in self.id_type_substrate:

            # an ID type might be given together with the key
            # where the identifier is stored in the record
            if isinstance(sub_id_type, (list, tuple)):
                sub_id_type, sub_id_attr = sub_id_type
            else:
                sub_id_attr = 'substrate'

            substrate_ups_all.update(
                set(
                    mapping.map_name(
                        p[sub_id_attr],
                        sub_id_type,
                        'uniprot',
                        self.ncbi_tax_id,
                    )))

        # looking up sequences in all isoforms:
        # ``substrate_ups`` collects (UniProt, isoform) pairs
        substrate_ups = []

        for s in substrate_ups_all:

            if 'substrate_isoform' in p and p['substrate_isoform']:

                substrate_ups.append((s, p['substrate_isoform']))

            else:

                se = self.get_seq(s)

                if se is None:
                    continue

                for isof in se.isoforms():

                    if 'instance' in p and p['instance'] is not None:

                        if se.match(
                                p['instance'],
                                p['start'],
                                p['end'],
                                isoform=isof,
                        ):

                            substrate_ups.append((s, isof))

                    else:

                        if se.match(
                                p['resaa'],
                                p['resnum'],
                                isoform=isof,
                        ):

                            substrate_ups.append((s, isof))

        if self.trace:

            if p['substrate'] not in self.sub_ambig:

                self.sub_ambig[p['substrate']] = substrate_ups

            for k in p['kinase']:

                if k not in self.kin_ambig:

                    self.kin_ambig[k] = kinase_ups

            # generating report on non matching substrates
            if not substrate_ups:

                start = p['start'] if 'start' in p else None
                end = p['end'] if 'end' in p else None

                # ``substrate_ups_all`` contains plain UniProt IDs, hence
                # the whole ID is used for the lookup and in the record
                for s in substrate_ups_all:

                    se = self.get_seq(s)

                    if se is None:
                        continue

                    self.nomatch.append((
                        s,
                        # no isoform matched, hence none to report
                        None,
                        (
                            p['substrate_refseq']
                            if 'substrate_refseq' in p else '',
                            s,
                            p['instance'] if 'instance' in p else None,
                            (
                                se.get(start, end)
                                if start is not None and end is not None
                                else None
                            ),
                        ),
                    ))

        # building objects representing the enzyme-substrate interaction(s)

        if 'typ' not in p:
            p['typ'] = 'phosphorylation'

        _resources = tuple(
            (self.input_param.
             get_via(name) if hasattr(self.input_param, 'get_via') else name)
            for name in (p['databases'] if 'databases' in p else ()))
        _resources += ((self.name, ) if isinstance(
            self.input_param, common.basestring) else (self.input_param, ))

        # collecting the evidences
        evidences = evidence.Evidences(
            evidence.Evidence(resource=_res,
                              references=p['references'] if 'references' in
                              p else None) for _res in _resources)

        for s in substrate_ups:

            # building the objects representing the substrate
            se = self.get_seq(s[0])

            if se is None:
                continue

            res = intera.Residue(
                p['resnum'],
                p['resaa'],
                s[0],
                isoform=s[1],
                ncbi_tax_id=self.ncbi_tax_id,
            )

            if 'instance' not in p or p['instance'] is None:

                reg = se.get_region(
                    p['resnum'],
                    p['start'] if 'start' in p else None,
                    p['end'] if 'end' in p else None,
                    isoform=s[1],
                )

                if reg is not None:

                    # NOTE(review): the region is stored in the record,
                    # hence it is reused for the subsequent isoforms
                    # instead of being looked up again -- confirm that
                    # this is intended
                    p['start'], p['end'], p['instance'] = reg

            mot = intera.Motif(
                s[0],
                p['start'],
                p['end'],
                instance=p['instance'],
                isoform=s[1],
                ncbi_tax_id=self.ncbi_tax_id,
            )

            ptm = intera.Ptm(
                s[0],
                motif=mot,
                residue=res,
                typ=p['typ'],
                evidences=evidences,
                isoform=s[1],
                ncbi_tax_id=self.ncbi_tax_id,
            )

            for k in kinase_ups:

                if (not self.allow_mixed_organisms
                        and (self.get_taxon(k) != self.ncbi_tax_id
                             or self.get_taxon(s[0]) != self.ncbi_tax_id)):
                    continue

                # the enzyme (kinase)
                dom = intera.Domain(
                    protein=k,
                    ncbi_tax_id=self.ncbi_tax_id,
                )

                dommot = intera.DomainMotif(
                    domain=dom,
                    ptm=ptm,
                    evidences=evidences,
                )

                # copying the extra attributes requested
                # by the input definition
                if hasattr(self.input_param, 'extra_attrs'):

                    for attr, key in iteritems(self.input_param.extra_attrs):

                        if key in p:

                            setattr(dommot, attr, p[key])

                yield dommot
Example #5
0
def phosphosite_enzyme_substrate(
    raw=True,
    organism='human',
    strict=True,
):
    """
    Downloads and preprocesses phosphorylation site data from PhosphoSitePlus.

    :param raw:
        If ``True``, the result consists of dicts, one for each kinase
        and site, with the keys ``resaa``, ``resnum``, ``start``, ``end``
        and ``instance`` added to the columns read from the table.
        Otherwise ``intera.DomainMotif`` objects are created.
    :param organism:
        Organism name as it is used in the table. Only records with
        substrate from this organism are processed; ``None`` disables
        the filtering by organism.
    :param strict:
        If ``True``, the kinase also must belong to ``organism``.
        Otherwise kinases of other organisms are translated to
        ``organism`` by orthology; records where this is not possible
        are skipped.

    :return:
        List of dicts or ``intera.DomainMotif`` objects, depending on
        ``raw``.
    """

    url = urls.urls['psite_kin']['url']
    c = curl.Curl(
        url,
        silent=False,
        compr='gz',
        encoding='iso-8859-1',
        large=True,
    )
    # orthology tables by source organism, loaded at first use
    orto = {}
    cols = {
        'kinase': 2,
        'kinase_org': 3,
        'substrate': 6,
        'substrate_org': 8,
        'residue': 9,
        'motif': 11
    }
    data = inputs_common.read_table(
        cols=cols,
        fileObject=c.result,
        sep='\t',
        hdr=4,
    )
    result = []
    non_digit = re.compile(r'[^\d.-]+')
    motre = re.compile(r'(_*)([A-Za-z]+)(_*)')

    for r in data:

        if organism is not None and (
            r['substrate_org'] != organism or
            (strict and r['kinase_org'] != organism)
        ):
            continue

        if organism is None or r['kinase_org'] == organism:

            kin_uniprot = [r['kinase']]

        else:

            # attempting to map by orthology; the list stays empty
            # if any of the organisms is unknown or no ortholog found
            kin_uniprot = []
            korg = r['kinase_org']

            if korg in taxonomy.taxa and organism in taxonomy.taxa:

                ktaxid = taxonomy.taxa[korg]
                taxid = taxonomy.taxa[organism]

                if korg not in orto:

                    orto[korg] = homology.homologene_dict(
                        ktaxid,
                        taxid,
                        'refseqp',
                    )

                korg_refseq = mapping.map_name(
                    r['kinase'],
                    'uniprot',
                    'refseqp',
                    ktaxid,
                )

                for refseq in korg_refseq:

                    if refseq not in orto[korg]:
                        continue

                    for target_refseq in orto[korg][refseq]:

                        kin_uniprot.extend(
                            mapping.map_name(
                                target_refseq,
                                'refseqp',
                                'uniprot',
                                taxid,
                            )
                        )

        if not kin_uniprot:
            continue

        # the attributes of the site are the same for all kinases
        r['resaa'] = r['residue'][0]
        r['resnum'] = int(non_digit.sub('', r['residue'][1:]))
        flanks = motre.match(r['motif'])

        # excluding e.g. Q12809_VAR_014388
        substrate = r['substrate'].split('_')[0]
        sisoform = (
            1 if '-' not in substrate else int(substrate.split('-')[1])
        )
        r['substrate'] = substrate.split('-')[0]

        if flanks:
            # underscores pad the window where it reaches over the
            # sequence ends; the boundaries are shifted accordingly
            r['start'] = r['resnum'] - 7 + len(flanks.groups()[0])
            r['end'] = r['resnum'] + 7 - len(flanks.groups()[2])
            r['instance'] = r['motif'].replace('_', '').upper()
        else:
            r['start'] = None
            r['end'] = None
            r['instance'] = None

        for kinase in kin_uniprot:

            kisoform = (
                1 if '-' not in kinase else int(kinase.split('-')[1])
            )
            kinase = kinase.split('-')[0]

            if raw:

                # each kinase gets its own copy of the record, otherwise
                # all entries would refer to the same dict
                record = dict(r)
                record['kinase'] = kinase
                result.append(record)

            else:

                res = intera.Residue(r['resnum'],
                                     r['resaa'],
                                     r['substrate'],
                                     isoform=sisoform)

                motif = intera.Motif(r['substrate'],
                                     r['start'],
                                     r['end'],
                                     instance=r['instance'],
                                     isoform=sisoform)

                ptm = intera.Ptm(protein=r['substrate'],
                                 residue=res,
                                 motif=motif,
                                 typ='phosphorylation',
                                 source='PhosphoSite',
                                 isoform=sisoform)

                dom = intera.Domain(protein=kinase, isoform=kisoform)

                dommot = intera.DomainMotif(domain=dom,
                                            ptm=ptm,
                                            sources=['PhosphoSite'])

                result.append(dommot)

    return result