def homologene_uniprot_dict(self, source):
    """
    Builds orthology translation table as dict from UniProt to UniProt,
    obtained from NCBI HomoloGene data. Uses RefSeq and Entrez IDs for
    translation.

    The table is stored in ``self.homo[source]``: every UniProt ID of the
    source proteome becomes a key, and its value is the sorted list of
    UniProt IDs obtained for the target organism (``self.target``).

    :param source:
        The source organism; it is passed through ``self.get_source``
        before being used.
    """

    source = self.get_source(source)
    self.homo[source] = {}

    # HomoloGene tables keyed by the source organism's Entrez / RefSeq IDs
    hge = dataio.homologene_dict(source, self.target, 'entrez')
    hgr = dataio.homologene_dict(source, self.target, 'refseq')

    self.load_proteome(source, self.only_swissprot)

    for u in self._proteomes[(source, self.only_swissprot)]:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
        target_u = set()
        target_r = set()
        target_e = set()

        # source IDs -> target IDs of the same type, via HomoloGene
        for e in source_e:

            if e in hge:

                target_e.update(hge[e])

        for r in source_r:

            if r in hgr:

                target_r.update(hgr[r])

        # target Entrez and RefSeq IDs -> target UniProt IDs
        for e in target_e:

            target_u.update(
                mapping.map_name(e, 'entrez', 'uniprot', self.target)
            )

        for r in target_r:

            # each RefSeq ID has to be translated itself (not the last
            # Entrez ID left over from the loop above)
            target_u.update(
                mapping.map_name(r, 'refseqp', 'uniprot', self.target)
            )

        # a final UniProt to UniProt mapping pass over the collected IDs
        target_u = itertools.chain.from_iterable(
            mapping.map_name(tu, 'uniprot', 'uniprot', self.target)
            for tu in target_u
        )

        self.homo[source][u] = sorted(target_u)
def build_gene(self):
    """
    Collects the gene records into the set ``self.gene``.

    Each item of ``self._entities`` is either used directly or, if it has
    a ``components`` attribute, replaced by those components. For each of
    them one ``CellPhoneDBGene`` is created per Ensembl ID returned by the
    UniProt to Ensembl mapping; the gene symbol is used both as
    ``gene_name`` and as ``hgnc_symbol``.
    """

    self.gene = set()

    for entity in self._entities:

        # the components of the complexes are included too; this might
        # not be strictly necessary but it should do no harm
        members = (
            entity.components
            if hasattr(entity, 'components') else
            (entity, )
        )

        for member in members:

            symbol = mapping.map_name0(member, 'uniprot', 'genesymbol')
            ensembl_ids = mapping.map_name(member, 'uniprot', 'ensembl')

            self.gene.update(
                CellPhoneDBGene(
                    gene_name=symbol,
                    uniprot=member,
                    hgnc_symbol=symbol,
                    ensembl=ensembl_id,
                )
                for ensembl_id in ensembl_ids
            )
def _process(self, p):
    """
    Processes one enzyme-substrate record and yields the resulting
    ``intera.DomainMotif`` objects.

    The enzyme and substrate identifiers are translated to UniProt, the
    substrate isoforms matching the residue (or the sequence instance)
    are looked up, and one ``DomainMotif`` is yielded for each
    enzyme-substrate isoform pair.

    :param dict p:
        The record. Keys read here: ``kinase``, ``substrate``, ``resaa``,
        ``resnum``, ``instance``, ``start``, ``end``; optionally
        ``substrate_isoform``, ``substrate_refseq``, ``typ`` and
        ``references``; and, depending on the input method,
        ``databases``, ``npmid``, ``score`` or ``source``.
        The dict is modified in place: ``kinase`` is wrapped in a list,
        ``typ`` and ``references`` get default values, and ``instance``,
        ``start`` and ``end`` are filled in from the sequence if
        ``instance`` is None.

    :return:
        Generator of ``intera.DomainMotif`` objects. Nothing is yielded
        if the record has no enzyme or if the substrate name starts
        with "HLA".
    """

    # human leukocyte antigens result in an
    # extremely high number of combinations
    if (
        not p['kinase'] or (
            isinstance(p['substrate'], common.basestring) and
            p['substrate'].startswith('HLA')
        )
    ):

        return

    if not isinstance(p['kinase'], list):

        p['kinase'] = [p['kinase']]

    kinase_ups = mapping.map_names(
        p['kinase'],
        self.enzyme_id_type,
        'uniprot',
        ncbi_tax_id=self.ncbi_tax_id,
    )

    substrate_ups_all = set()

    for sub_id_type in (
        self.substrate_id_types[self.input_method.lower()]
        if self.input_is(self.substrate_id_types, '__contains__') else
        [self.substrate_id_type]
    ):

        # an ID type may come as a tuple together with
        # the key of the record which holds the identifier
        if isinstance(sub_id_type, tuple):

            sub_id_type, sub_id_attr = sub_id_type

        else:

            sub_id_attr = 'substrate'

        substrate_ups_all.update(
            mapping.map_name(
                p[sub_id_attr],
                sub_id_type,
                'uniprot',
                self.ncbi_tax_id,
            )
        )

    # looking up sequences in all isoforms:
    substrate_ups = []

    for s in substrate_ups_all:

        if 'substrate_isoform' in p and p['substrate_isoform']:

            substrate_ups.append((s, p['substrate_isoform']))

        else:

            se = self.get_seq(s)

            if se is None:

                continue

            for isof in se.isoforms():

                if p['instance'] is not None:

                    if se.match(
                        p['instance'],
                        p['start'],
                        p['end'],
                        isoform=isof,
                    ):

                        substrate_ups.append((s, isof))

                else:

                    if se.match(p['resaa'], p['resnum'], isoform=isof):

                        substrate_ups.append((s, isof))

    if self.trace:

        # only the first translation seen for each name is recorded
        if p['substrate'] not in self.sub_ambig:

            self.sub_ambig[p['substrate']] = substrate_ups

        for k in p['kinase']:

            if k not in self.kin_ambig:

                self.kin_ambig[k] = kinase_ups

    # generating report on non matching substrates
    # NOTE(review): the items of `substrate_ups_all` are added above as
    # plain identifiers, yet they are indexed here like (id, isoform)
    # tuples; also `nomatch` is not defined within this method. Left
    # as it was -- confirm the intent before relying on this report.
    if not substrate_ups:

        for s in substrate_ups_all:

            se = self.get_seq(s[0])

            if se is None:

                continue

            nomatch.append(
                (
                    s[0],
                    s[1],
                    (
                        (
                            p['substrate_refseq']
                            if 'substrate_refseq' in p else
                            ''
                        ),
                        s,
                        p['instance'],
                        se.get(p['start'], p['end']),
                    ),
                )
            )

    # adding kinase-substrate interactions
    for k in kinase_ups:

        for s in substrate_ups:

            if (
                not self.allow_mixed_organisms and (
                    self.get_taxon(k) != self.ncbi_tax_id or
                    self.get_taxon(s[0]) != self.ncbi_tax_id
                )
            ):

                continue

            se = self.get_seq(s[0])

            if se is None:

                continue

            res = intera.Residue(
                p['resnum'],
                p['resaa'],
                s[0],
                isoform=s[1],
            )

            if p['instance'] is None:

                reg = se.get_region(
                    p['resnum'],
                    p['start'],
                    p['end'],
                    isoform=s[1],
                )

                if reg is not None:

                    p['instance'] = reg[2]
                    p['start'] = reg[0]
                    p['end'] = reg[1]

            if 'typ' not in p:

                p['typ'] = 'phosphorylation'

            mot = intera.Motif(
                s[0],
                p['start'],
                p['end'],
                instance=p['instance'],
                isoform=s[1],
            )

            ptm = intera.Ptm(
                s[0],
                motif=mot,
                residue=res,
                typ=p['typ'],
                source=[self.name],
                isoform=s[1],
            )

            dom = intera.Domain(protein=k)

            if 'references' not in p:

                p['references'] = []

            dommot = intera.DomainMotif(
                domain=dom,
                ptm=ptm,
                sources=[self.name],
                refs=p['references'],
            )

            if self.input_is('mimp'):

                # the database names are separated by semicolons
                dommot.mimp_sources = p['databases'].split(';')
                dommot.npmid = p['npmid']

            elif self.input_is('phosphonetworks'):

                dommot.pnetw_score = p['score']

            elif self.input_is('dbptm'):

                dommot.dbptm_sources = [p['source']]

            yield dommot