def li2012_dmi():
    """
    Converts table read by ``pypath.inputs.li2012.get_li2012`` to
    list of ``pypath.internals.intera.DomainMotif`` objects.
    Translates GeneSymbols to UniProt IDs.

    For each row the substrate (second column, ``GeneSymbol/site``) and
    the tyrosine kinase (third column) are mapped to UniProt IDs. The
    first isoform of the substrate that has tyrosine at the given
    position is selected, and one ``DomainMotif`` is created for each
    kinase UniProt: the kinase is the domain, the substrate residue with
    a window of up to 7 residues on both sides is the motif. The reader
    protein (fourth column) is not used here.

    Returns
        List of ``intera.DomainMotif`` objects; substrates without
        sequence or without a matching tyrosine are omitted.
    """

    result = []
    nondigit = re.compile(r'[^\d]+')
    se = uniprot_input.swissprot_seq(isoforms=True)

    for l in get_li2012():

        subs_fields = l[1].split('/')
        subs_protein = subs_fields[0]
        subs_resnum = int(nondigit.sub('', subs_fields[1]))
        tk_protein = l[2].split()[0]

        subs_uniprots = mapping.map_name(
            subs_protein,
            'genesymbol',
            'uniprot',
        )
        tk_uniprots = mapping.map_name(tk_protein, 'genesymbol', 'uniprot')

        for su in subs_uniprots:

            if su not in se:
                continue

            seq = se[su]
            subs_iso = None

            # the first isoform with tyrosine at this position
            for iso in seq.isof:
                if seq.get(subs_resnum, isoform=iso) == 'Y':
                    subs_iso = iso
                    break

            if not subs_iso:
                continue

            # the window must stay within the sequence of the isoform
            start = max(1, subs_resnum - 7)
            end = min(subs_resnum + 7, len(seq.isof[subs_iso]))
            instance = seq.get(start, end, isoform=subs_iso)

            for ku in tk_uniprots:

                res = intera.Residue(
                    subs_resnum,
                    'Y',
                    su,
                    isoform=subs_iso,
                )
                mot = intera.Motif(
                    su,
                    start,
                    end,
                    isoform=subs_iso,
                    instance=instance,
                )
                ptm = intera.Ptm(
                    su,
                    motif=mot,
                    residue=res,
                    isoform=subs_iso,
                    source='Li2012',
                )
                dom = intera.Domain(ku)
                dommot = intera.DomainMotif(
                    domain=dom,
                    ptm=ptm,
                    sources=['Li2012'],
                )
                result.append(dommot)

    return result
def phosphosite_ptms(organism='human'):
    """
    Downloads the phosphorylation site dataset from PhosphoSitePlus.

    The first four lines of the file are skipped. Every further line
    is split by tabs; lines with more than nine fields and with the
    seventh field equal to ``organism`` (any line if ``organism`` is
    ``None``) are converted to ``intera.Ptm`` objects.

    Returns
        List of ``intera.Ptm`` objects.
    """

    url = urls.urls['psite_p']['url']
    re_nondigit = re.compile(r'[^\d]+')
    re_flanks = re.compile(r'(_*)([A-Za-z]+)(_*)')

    c = curl.Curl(url, silent=False, large=True)
    rows = c.result

    # discarding the first four lines
    for _ in xrange(4):
        c.result.readline()

    ptms = []

    for row in rows:

        fields = row.split('\t')

        if len(fields) <= 9:
            continue

        if organism is not None and fields[6] != organism:
            continue

        # accession with optional isoform suffix, e.g. `P12345-2`
        ac_parts = fields[2].split('-')
        isoform = int(ac_parts[1]) if len(ac_parts) > 1 else 1
        uniprot = ac_parts[0]

        modres = fields[4]
        typ = fields[3].lower()

        if not typ:
            # falling back to the suffix of the residue field
            typ = modres.split('-')[1] if '-' in modres else None

        aa = modres[0]
        num = int(re_nondigit.sub('', modres))
        flanks = re_flanks.match(fields[9])

        if flanks:
            # underscores pad the window where it runs off the sequence
            start = num - 7 + len(flanks.group(1))
            end = num + 7 - len(flanks.group(3))
            instance = fields[9].replace('_', '').upper()
        else:
            start = end = instance = None

        res = intera.Residue(num, aa, uniprot, isoform=isoform)
        mot = intera.Motif(
            uniprot,
            start,
            end,
            instance=instance,
            isoform=isoform,
        )
        ptms.append(
            intera.Ptm(
                uniprot,
                typ=typ,
                motif=mot,
                residue=res,
                source='PhosphoSite',
                isoform=isoform,
            )
        )

    return ptms
def translate_ptm(self, ptm):
    """
    Translates one PTM to the target organism.

    The site of ``ptm`` (protein, residue name, number, isoform and
    modification type) is passed to ``self.translate_site``; for each
    site it returns a new ``intera.Ptm`` is built with
    ``ncbi_tax_id = self.target`` and the evidences of the input
    ``ptm``.

    If no sequence is available for the translated protein or isoform,
    the site is skipped when ``self.strict`` is true; otherwise it is
    kept with a motif that has no coordinates and no instance.

    Returns
        List of ``intera.Ptm`` objects (empty if nothing translated).
    """

    translated = self.translate_site(
        ptm.protein,
        ptm.residue.name,
        ptm.residue.number,
        ptm.residue.isoform,
        ptm.typ,
    )

    result = []

    for site in translated:

        se = self.get_seq(site[0])
        has_seq = se is not None and site[1] in se.isof

        if not has_seq and self.strict:
            continue

        uniprot, isoform, resaa, resnum = site[0], site[1], site[2], site[3]

        res = intera.Residue(
            number=resnum,
            name=resaa,
            protein=uniprot,
            isoform=isoform,
            ncbi_tax_id=self.target,
        )

        region = se.get_region(resnum, isoform=isoform) if has_seq else None

        # `get_region` may give no result; treated like a missing sequence
        if region is None:
            region = (None, None, None)

        start, end, instance = region

        mot = intera.Motif(
            protein=uniprot,
            start=start,
            end=end,
            instance=instance,
            isoform=isoform,
            ncbi_tax_id=self.target,
        )

        # a distinct name here: the evidences must always come
        # from the input `ptm`, not from a previously built object
        translated_ptm = intera.Ptm(
            protein=uniprot,
            motif=mot,
            residue=res,
            typ=site[5],
            isoform=isoform,
            evidences=ptm.evidences,
            ncbi_tax_id=self.target,
        )
        result.append(translated_ptm)

    return result
def _process(self, p):
    """
    Generates ``intera.DomainMotif`` objects from one input record.

    ``p`` is a dict-like record. Keys read here: ``kinase``,
    ``substrate`` (or the attributes named in
    ``self.id_type_substrate``), ``resaa``, ``resnum`` and optionally
    ``substrate_isoform``, ``instance``, ``start``, ``end``, ``typ``,
    ``databases``, ``references``, ``substrate_refseq`` and the keys
    listed in ``self.input_param.extra_attrs``.

    The record is modified in place in two ways: ``kinase`` is wrapped
    into a list if it is not a list, and ``typ`` is set to
    ``phosphorylation`` if missing. The motif region looked up from
    the sequence is computed for each substrate isoform separately and
    is not written back into ``p``.

    Yields
        One ``intera.DomainMotif`` for each combination of mapped
        enzyme UniProt and matching substrate isoform. Nothing is
        yielded for records without enzyme or with a substrate name
        starting with ``HLA``.
    """

    # human leukocyte antigens would result in an
    # extremely high number of combinations
    if (
        not p['kinase'] or (
            isinstance(p['substrate'], common.basestring) and
            p['substrate'].startswith('HLA')
        )
    ):
        return

    if not isinstance(p['kinase'], list):
        p['kinase'] = [p['kinase']]

    kinase_ups = mapping.map_names(
        p['kinase'],
        self.id_type_enzyme,
        'uniprot',
        ncbi_tax_id=self.ncbi_tax_id,
    )

    substrate_ups_all = set()

    for sub_id_type in self.id_type_substrate:

        if isinstance(sub_id_type, (list, tuple)):
            sub_id_type, sub_id_attr = sub_id_type
        else:
            sub_id_attr = 'substrate'

        substrate_ups_all.update(
            mapping.map_name(
                p[sub_id_attr],
                sub_id_type,
                'uniprot',
                self.ncbi_tax_id,
            )
        )

    # the motif as provided by the input, if any
    instance = p['instance'] if 'instance' in p else None
    start = p['start'] if 'start' in p else None
    end = p['end'] if 'end' in p else None
    has_instance = instance is not None

    # looking up sequences in all isoforms:
    substrate_ups = []

    for s in substrate_ups_all:

        if 'substrate_isoform' in p and p['substrate_isoform']:
            substrate_ups.append((s, p['substrate_isoform']))
            continue

        se = self.get_seq(s)

        if se is None:
            continue

        for isof in se.isoforms():

            if has_instance:
                matched = se.match(
                    instance,
                    p['start'],
                    p['end'],
                    isoform=isof,
                )
            else:
                matched = se.match(
                    p['resaa'],
                    p['resnum'],
                    isoform=isof,
                )

            if matched:
                substrate_ups.append((s, isof))

    if self.trace:

        if p['substrate'] not in self.sub_ambig:
            self.sub_ambig[p['substrate']] = substrate_ups

        for k in p['kinase']:
            if k not in self.kin_ambig:
                self.kin_ambig[k] = kinase_ups

        # generating report on non matching substrates
        if not substrate_ups:

            for s in substrate_ups_all:

                # `substrate_ups_all` contains plain UniProt IDs
                se = self.get_seq(s)

                if se is None:
                    continue

                observed = (
                    se.get(start, end)
                    if start is not None and end is not None else
                    None
                )

                # no isoform matched, hence `None` in the second field
                self.nomatch.append((
                    s,
                    None,
                    (
                        p['substrate_refseq']
                        if 'substrate_refseq' in p else
                        '',
                        s,
                        instance,
                        observed,
                    ),
                ))

    # building objects representing the enzyme-substrate interaction(s)
    if 'typ' not in p:
        p['typ'] = 'phosphorylation'

    _resources = tuple(
        self.input_param.get_via(name)
        if hasattr(self.input_param, 'get_via') else
        name
        for name in (p['databases'] if 'databases' in p else ())
    )
    _resources += (
        (self.name, )
        if isinstance(self.input_param, common.basestring) else
        (self.input_param, )
    )

    # collecting the evidences
    references = p['references'] if 'references' in p else None
    evidences = evidence.Evidences(
        evidence.Evidence(resource=_res, references=references)
        for _res in _resources
    )

    for s in substrate_ups:

        # building the objects representing the substrate
        se = self.get_seq(s[0])

        if se is None:
            continue

        res = intera.Residue(
            p['resnum'],
            p['resaa'],
            s[0],
            isoform=s[1],
            ncbi_tax_id=self.ncbi_tax_id,
        )

        mot_start, mot_end, mot_instance = start, end, instance

        if not has_instance:

            # the region belongs to this isoform only,
            # it must not be carried over to the next one
            reg = se.get_region(
                p['resnum'],
                start,
                end,
                isoform=s[1],
            )

            if reg is not None:
                mot_start, mot_end, mot_instance = reg

        mot = intera.Motif(
            s[0],
            mot_start,
            mot_end,
            instance=mot_instance,
            isoform=s[1],
            ncbi_tax_id=self.ncbi_tax_id,
        )
        ptm = intera.Ptm(
            s[0],
            motif=mot,
            residue=res,
            typ=p['typ'],
            evidences=evidences,
            isoform=s[1],
            ncbi_tax_id=self.ncbi_tax_id,
        )

        for k in kinase_ups:

            if (
                not self.allow_mixed_organisms and (
                    self.get_taxon(k) != self.ncbi_tax_id or
                    self.get_taxon(s[0]) != self.ncbi_tax_id
                )
            ):
                continue

            # the enzyme (kinase)
            dom = intera.Domain(
                protein=k,
                ncbi_tax_id=self.ncbi_tax_id,
            )
            dommot = intera.DomainMotif(
                domain=dom,
                ptm=ptm,
                evidences=evidences,
            )

            if hasattr(self.input_param, 'extra_attrs'):
                for attr, key in iteritems(self.input_param.extra_attrs):
                    if key in p:
                        setattr(dommot, attr, p[key])

            yield dommot
def phosphosite_enzyme_substrate(
        raw=True,
        organism='human',
        strict=True,
    ):
    """
    Downloads and preprocesses phosphorylation site data from
    PhosphoSitePlus.

    Args
        raw: If true, the records are returned as dicts with the keys
            ``kinase``, ``kinase_org``, ``substrate``,
            ``substrate_org``, ``residue``, ``motif``, ``resaa``,
            ``resnum``, ``start``, ``end`` and ``instance``; one dict
            for each kinase. Otherwise ``intera.DomainMotif`` objects
            are created.
        organism: Organism name as it is written in the organism
            columns of the table. Only records with the substrate
            from this organism are kept. If ``None``, all records are
            kept and the kinase IDs are used as they are.
        strict: If false, kinases from other organisms are accepted
            too and translated by orthology to ``organism``; records
            where this is not possible are dropped.

    Returns
        List of dicts or list of ``intera.DomainMotif`` objects,
        depending on ``raw``.
    """

    url = urls.urls['psite_kin']['url']
    c = curl.Curl(
        url,
        silent=False,
        compr='gz',
        encoding='iso-8859-1',
        large=True,
    )
    orto = {}
    cols = {
        'kinase': 2,
        'kinase_org': 3,
        'substrate': 6,
        'substrate_org': 8,
        'residue': 9,
        'motif': 11,
    }
    data = inputs_common.read_table(
        cols=cols,
        fileObject=c.result,
        sep='\t',
        hdr=4,
    )
    result = []
    non_digit = re.compile(r'[^\d.-]+')
    motre = re.compile(r'(_*)([A-Za-z]+)(_*)')

    for r in data:

        if not (
            organism is None or (
                (r['kinase_org'] == organism or not strict) and
                r['substrate_org'] == organism
            )
        ):
            continue

        if organism is None or r['kinase_org'] == organism:

            kin_uniprot = [r['kinase']]

        else:

            # never carry over the kinases of a previous record
            kin_uniprot = []
            korg = r['kinase_org']

            # attempting to map by orthology:
            if korg in taxonomy.taxa and organism in taxonomy.taxa:

                ktaxid = taxonomy.taxa[korg]
                taxid = taxonomy.taxa[organism]

                if korg not in orto:
                    orto[korg] = homology.homologene_dict(
                        ktaxid,
                        taxid,
                        'refseqp',
                    )

                korg_refseq = mapping.map_name(
                    r['kinase'],
                    'uniprot',
                    'refseqp',
                    ktaxid,
                )
                kin_uniprot = [
                    uniprot
                    for rs in korg_refseq
                    if rs in orto[korg]
                    for ors in orto[korg][rs]
                    for uniprot in mapping.map_name(
                        ors,
                        'refseqp',
                        'uniprot',
                        taxid,
                    )
                ]

        if not kin_uniprot:
            continue

        # the substrate side is the same for all kinases of the record
        r['resaa'] = r['residue'][0]
        r['resnum'] = int(non_digit.sub('', r['residue'][1:]))
        motif_match = motre.match(r['motif'])

        # excluding e.g. Q12809_VAR_014388
        substrate = r['substrate'].split('_')[0]
        sisoform = (
            1 if '-' not in substrate else int(substrate.split('-')[1])
        )
        r['substrate'] = substrate.split('-')[0]

        if motif_match:
            # underscores pad the window where it runs off the sequence
            r['start'] = r['resnum'] - 7 + len(motif_match.group(1))
            r['end'] = r['resnum'] + 7 - len(motif_match.group(3))
            r['instance'] = r['motif'].replace('_', '').upper()
        else:
            r['start'] = None
            r['end'] = None
            r['instance'] = None

        for kinase in kin_uniprot:

            kisoform = (
                1 if '-' not in kinase else int(kinase.split('-')[1])
            )
            kinase = kinase.split('-')[0]

            if raw:

                # one independent record for each kinase
                rec = dict(r)
                rec['kinase'] = kinase
                result.append(rec)

            else:

                res = intera.Residue(
                    r['resnum'],
                    r['resaa'],
                    r['substrate'],
                    isoform=sisoform,
                )
                mot = intera.Motif(
                    r['substrate'],
                    r['start'],
                    r['end'],
                    instance=r['instance'],
                    isoform=sisoform,
                )
                ptm = intera.Ptm(
                    protein=r['substrate'],
                    residue=res,
                    motif=mot,
                    typ='phosphorylation',
                    source='PhosphoSite',
                    isoform=sisoform,
                )
                dom = intera.Domain(protein=kinase, isoform=kisoform)
                dommot = intera.DomainMotif(
                    domain=dom,
                    ptm=ptm,
                    sources=['PhosphoSite'],
                )
                result.append(dommot)

    return result