def depod_enzyme_substrate(organism=9606):
    """
    Collects dephosphorylation records from the two DEPOD tables.

    The first table carries the substrate type, organism, residues and
    references; the second (MITAB) table is read at the same row index to
    obtain the UniProt IDs of the enzyme and the substrate.

    :param int organism:
        Only rows whose organism column resolves to this NCBI Taxonomy
        ID are processed.

    :return:
        List of dicts, one for each combination of enzyme ID, substrate
        ID and residue, with the keys `instance`, `kinase`, `resaa`,
        `resnum`, `references`, `substrate`, `start`, `end` and `typ`.
    """

    uniprot_pattern = re.compile(r'uniprotkb:([A-Z0-9]+)')
    residue_pattern = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')
    reference_separator = re.compile(r'[,|]\s?')

    main_download = curl.Curl(
        urls.urls['depod']['urls'][0],
        silent=False,
        encoding='ascii',
    )
    # the first row is the header
    main_rows = [
        line.split('\t')
        for line in main_download.result.split('\n')
    ][1:]

    mitab_download = curl.Curl(
        urls.urls['depod']['urls'][1],
        silent=False,
        encoding='iso-8859-1',
    )
    mitab_rows = [
        line.split('\t')
        for line in mitab_download.result.split('\n')
    ][1:]

    records = []

    for idx, row in enumerate(main_rows):

        if len(row) <= 6 or row[2] != 'protein substrate':
            continue

        row_organism = taxonomy.ensure_ncbi_tax_id(
            row[3].split('(')[0].strip()
        )

        if row_organism != organism or row[4].strip() == 'N/A':
            continue

        # both tables are assumed to be aligned row by row
        mitab_row = mitab_rows[idx]
        enzyme_uniprot = uniprot_pattern.search(mitab_row[0]).groups()[0]
        substrate_uniprot = uniprot_pattern.search(mitab_row[1]).groups()[0]

        id_pairs = itertools.product(
            mapping.map_name(enzyme_uniprot, 'uniprot', 'uniprot'),
            mapping.map_name(substrate_uniprot, 'uniprot', 'uniprot'),
        )

        for enzyme_up, substrate_up in id_pairs:

            for aa_3_letter, position in residue_pattern.findall(row[4]):

                # three letter codes without a known one letter
                # equivalent are kept unchanged
                if aa_3_letter in common.aminoa_3_to_1_letter:
                    resaa = common.aminoa_3_to_1_letter[aa_3_letter]
                else:
                    resaa = aa_3_letter

                records.append({
                    'instance': None,
                    'kinase': enzyme_up,
                    'resaa': resaa,
                    'resnum': int(position),
                    'references': reference_separator.split(row[6].strip()),
                    'substrate': substrate_up,
                    'start': None,
                    'end': None,
                    'typ': 'dephosphorylation',
                })

    return records
def celltalkdb_annotations(organism=9606):
    """
    Ligand and receptor role annotations of proteins from CellTalkDB
    (http://tcm.zju.edu.cn/celltalkdb/index.php).

    :param int,str organism:
        Human and mouse are supported; any other value, including ones
        that can not be interpreted, falls back to human.

    :return:
        Dictionary with UniProt IDs as keys and sets of annotation
        records (`role`, `pmid`) as values.
    """

    CellTalkDBAnnotation = collections.namedtuple(
        'CellTalkDBAnnotation',
        ['role', 'pmid'],
    )

    ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(organism)

    if ncbi_tax_id not in {9606, 10090}:
        ncbi_tax_id = 9606

    annot = collections.defaultdict(set)

    for rec in celltalkdb_download(organism=ncbi_tax_id):

        for role in ('ligand', 'receptor'):

            genesymbol = getattr(rec, '%s_gene_symbol' % role)

            translated = mapping.map_name(
                genesymbol,
                'genesymbol',
                'uniprot',
                ncbi_tax_id=ncbi_tax_id,
            )

            for uniprot in translated:

                record = CellTalkDBAnnotation(role=role, pmid=rec.evidence)
                annot[uniprot].add(record)

    return annot
def hippie_interactions(
        score_threshold=.75,
        only_human=False,
        only_sources=None,
        only_methods=None,
        methods=False,
        sources=False,
        references=True,
        organisms=False,
    ):
    """
    Retrieves scored protein-protein interactions from HIPPIE.

    :param float score_threshold:
        Records with a score below this value are skipped.
    :param bool only_human:
        Keep only records with human (9606) among their organisms.
        Records without species information are considered human.
    :param only_sources:
        Keep only records supported by at least one of these source
        databases. Passed through `common.to_set`; no filtering if empty.
    :param only_methods:
        Keep only records supported by at least one of these experimental
        methods. Passed through `common.to_set`; no filtering if empty.
    :param bool methods:
        Include the methods in the records (otherwise `None`).
    :param bool sources:
        Include the source databases in the records (otherwise `None`).
    :param bool references:
        Include the PubMed IDs in the records (otherwise `None`).
    :param bool organisms:
        Include the NCBI Taxonomy IDs in the records (otherwise `None`).

    :return:
        List of unique `HippieInteraction` named tuples.
    """

    def sorted_tuple(values):
        # sorted tuples keep the records hashable and deterministic
        return tuple(sorted(values))

    only_sources = common.to_set(only_sources)
    only_methods = common.to_set(only_methods)

    HippieInteraction = collections.namedtuple(
        'HippieInteraction',
        [
            'id_a',
            'id_b',
            'score',
            'methods',
            'references',
            'sources',
            'organisms',
        ],
    )

    url = urls.urls['hippie']['url']
    c = curl.Curl(url, large=True, silent=False)

    result = set()

    for line in c.result:

        fields = line.strip('\r\n').split('\t')
        score = float(fields[4])

        if score < score_threshold:
            continue

        # the details column looks like `key:v1,v2;key:v1,...`;
        # items without a key-value separator are ignored
        details = {}

        for item in fields[5].split(';'):

            key, sep, values = item.partition(':')

            if sep:
                details[key] = set(values.split(','))

        experiments = details.get('experiments', set())
        _sources = details.get('sources', set())

        if only_methods and not experiments & only_methods:
            continue

        # this filter depends on `only_sources`, not on `only_methods`
        if only_sources and not _sources & only_sources:
            continue

        _organisms = {9606}

        if 'species' in details:

            names = {
                spec.split('(')[0].strip()
                for spec in details['species']
            }
            _organisms = {
                taxonomy.ensure_ncbi_tax_id(name)
                for name in names
            }
            _organisms.discard(None)

        if only_human and 9606 not in _organisms:
            continue

        # everything below is identical for all ID pairs of this line
        rec_methods = sorted_tuple(experiments) if methods else None
        rec_references = (
            sorted_tuple(details.get('pmids', set()))
            if references else
            None
        )
        rec_sources = sorted_tuple(_sources) if sources else None
        rec_organisms = sorted_tuple(_organisms) if organisms else None

        ids_a_1 = mapping.map_name(fields[0], 'uniprot-entry', 'uniprot')
        ids_a_2 = mapping.map_name(fields[1], 'entrez', 'uniprot')
        ids_b_1 = mapping.map_name(fields[2], 'uniprot-entry', 'uniprot')
        ids_b_2 = mapping.map_name(fields[3], 'entrez', 'uniprot')

        for id_a, id_b in itertools.product(
            ids_a_1 | ids_a_2,
            ids_b_1 | ids_b_2,
        ):

            result.add(
                HippieInteraction(
                    id_a=id_a,
                    id_b=id_b,
                    score=score,
                    methods=rec_methods,
                    references=rec_references,
                    sources=rec_sources,
                    organisms=rec_organisms,
                )
            )

    return list(result)
def iptmnet_interactions(organism=9606):
    """
    Enzyme-substrate (PTM) interactions from iPTMnet.

    The score table is read first and indexed by enzyme, substrate,
    substrate isoform, PTM type, residue and residue number; the PTM
    table is then iterated and each record is completed with its score
    if one is available.

    :param int organism:
        NCBI Taxonomy ID; only PTM records of this organism are yielded.
        If it evaluates to False, no organism filter is applied.

    :return:
        Generator of `IptmnetInteraction` records; `score` is `None`
        if the score table has no matching entry.
    """

    ptm_url = urls.urls['iptmnet']['ptms']
    score_url = urls.urls['iptmnet']['scores']

    c = curl.Curl(score_url, large=True, silent=False)

    scores = {}

    for line in c.result:

        line = line.strip('\n\r').split('\t')

        # too short lines (e.g. empty ones) and lines without enzyme
        if len(line) < 5 or not line[2]:
            continue

        site = resite.match(line[1])

        if not site:
            continue

        resaa, resnum = site.groups()
        resnum = int(resnum)
        score = int(line[4])
        substrate, isoform = inputs_common._try_isoform(line[0])
        enzyme = line[2]

        key = (
            enzyme,
            substrate,
            isoform,
            line[3].lower(),  # PTM type
            resaa,
            resnum,
        )

        scores[key] = score

    c = curl.Curl(ptm_url, large=True, silent=False)

    for line in c.result:

        line = line.strip('\n\r').split('\t')

        # the columns up to index 9 (references) are used below
        if len(line) < 10 or not line[6]:
            continue

        ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(line[4].strip())

        if organism and ncbi_tax_id != organism:
            continue

        site = resite.match(line[5])

        if not site:
            continue

        # same group order and type as in the score table above,
        # otherwise the keys would never match
        resaa, resnum = site.groups()
        resnum = int(resnum)

        substrate, s_isoform = inputs_common._try_isoform(line[2])
        ptm_type = line[0].lower()
        enzyme, e_isoform = inputs_common._try_isoform(line[6])

        enzyme_ids = (
            mapping.map_name(
                line[6],
                'pro',
                'uniprot',
                ncbi_tax_id=organism,
            )
            if line[6].startswith('PR:') else
            (enzyme, )
        )

        refs = line[9].split(',')

        key = (
            line[6],
            substrate,
            s_isoform,
            ptm_type,
            resaa,
            resnum,
        )

        score = scores.get(key)

        for _enzyme in enzyme_ids:

            yield IptmnetInteraction(
                enzyme=_enzyme,
                substrate=substrate,
                enzyme_isoform=e_isoform,
                substrate_isoform=s_isoform,
                ptm_type=ptm_type,
                resaa=resaa,
                resnum=resnum,
                score=score,
                references=refs,
            )
def get_pfam(uniprots=None, organism=9606):
    """
    Retrieves Pfam domain annotations of proteins from UniProt.

    :param uniprots:
        Iterable of UniProt accessions. If `None`, the SwissProt IDs of
        `organism` are obtained from `uniprot_input.all_uniprots`; if
        that gives `None` too, one organism level query is sent instead
        of the queries by accession.
    :param int,str organism:
        Organism, used only if `uniprots` is not provided.

    :return:
        Tuple of two dicts: UniProt IDs to lists of Pfam IDs and Pfam
        IDs to lists of UniProt IDs. `(None, None)` if the download
        failed or the organism could not be recognized.
    """

    if uniprots is None:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )

    u_pfam = {}
    pfam_u = {}

    if uniprots is not None:

        # slicing is used below, sets and other iterables are welcome
        uniprots = list(uniprots)

        prg = progress.Progress(
            len(uniprots) / 30,
            'Downloading data from UniProt',
            1,
        )

        data_all = []

        # the accessions are queried in batches of 30
        for i in range(0, len(uniprots), 30):

            thisPart = ' OR '.join(
                'accession:%s' % u
                for u in uniprots[i:i + 30]
            )

            get = {
                'query': thisPart,
                'format': 'tab',
                'columns': 'id,database(Pfam)'
            }

            # up to 3 attempts for each batch
            for _ in range(3):

                c = curl.Curl(urls.urls['uniprot_basic']['url'], get=get)
                data = c.result

                if data is not None:
                    break

            if data is None:
                return None, None

            # only the header is dropped here; empty lines
            # are skipped when processing the records below
            data_all += data.split('\n')[1:]
            prg.step()

        prg.terminate()

    else:

        organism = taxonomy.ensure_ncbi_tax_id(organism)

        if not organism:
            return None, None

        organismQuery = 'organism:%u AND reviewed:yes' % organism

        get = {
            'query': organismQuery,
            'format': 'tab',
            'columns': 'id,database(Pfam)'
        }

        for _ in range(3):

            c = curl.Curl(
                urls.urls['uniprot_basic']['url'],
                get=get,
                silent=False,
                outf='uniprot-pfam-%u.tab' % organism,
            )
            data_all = c.result

            if data_all is not None:
                break

        if data_all is None:
            # same shape as all other return values
            return None, None

        data_all = data_all.split('\n')[1:]

    for l in data_all:

        l = l.split('\t')

        # empty or truncated lines
        if len(l) < 2:
            continue

        # the Pfam column is a semicolon terminated list
        pfams = [
            pfam.strip()
            for pfam in l[1].split(';')
            if pfam.strip()
        ]

        u_pfam.setdefault(l[0], []).extend(pfams)

        for pfam in pfams:

            pfam_u.setdefault(pfam, []).append(l[0])

    return u_pfam, pfam_u
def gpcrdb_annotations(organism=9606):
    """
    GPCR class, family and subfamily annotations from GPCRdb.

    The hierarchy level of each line in the families file is recognized
    by its leading spaces: no indentation means a class, 4 spaces a
    family, 8 spaces a subfamily, while deeper lines are the receptor
    records themselves.

    :param int,str organism:
        Only human and mouse (9606 and 10090) are supported; for any
        other organism an empty dict is returned.

    :return:
        Dict with UniProt IDs as keys and sets of `GpcrdbAnnotation`
        records as values.
    """

    GpcrdbAnnotation = collections.namedtuple(
        'GpcrdbAnnotation',
        ['gpcr_class', 'family', 'subfamily'],
    )

    organism = taxonomy.ensure_ncbi_tax_id(organism)

    if organism not in (9606, 10090):
        return {}

    # column of the UniProt ID within the receptor records
    if organism == 10090:
        i_uniprot = 31
    else:
        i_uniprot = 15

    download = curl.Curl(
        urls.urls['gpcrdb']['families'],
        silent=False,
        large=True,
    )

    annotations = collections.defaultdict(set)

    for raw in download.result:

        if raw[0] != ' ':
            # new class: the lower levels start over
            gpcr_class = raw.split('|')[0].strip()
            family = None
            subfamily = None
            continue

        if raw[4] != ' ':
            family = raw.strip()
            subfamily = None
            continue

        if raw[8] != ' ':
            subfamily = raw.strip()
            continue

        record = raw.strip().strip('"')

        if not record.startswith('gpcr'):
            continue

        uniprot = record.split('","')[i_uniprot]

        if not uniprot:
            continue

        annotations[uniprot].add(
            GpcrdbAnnotation(
                gpcr_class=gpcr_class,
                family=family,
                subfamily=subfamily,
            )
        )

    return dict(annotations)
def _cellchatdb_organism(organism = 9606):
    """
    Resolves the organism to one of the two supported NCBI Taxonomy IDs.

    :param int,str organism:
        Organism in any form `taxonomy.ensure_ncbi_tax_id` accepts.

    :return:
        10090 if the organism resolves to mouse, 9606 in any other case.
    """

    if taxonomy.ensure_ncbi_tax_id(organism) == 10090:
        return 10090

    return 9606
def __init__(
        self,
        components,
        ncbi_tax_id=9606,
        name=None,
        ids=None,
        sources=None,
        interactions=None,
        references=None,
        proteins=None,
        attrs=None,
    ):
    """
    Creates the representation of a molecular complex.

    components : list,dict
        The members of the complex: either a dict with identifiers as
        keys and stoichiometric coefficients as values, or a list of
        identifiers where repeated identifiers define the stoichiometry.
    ncbi_tax_id : int
        NCBI taxonomy identifier of the complex. All members are assumed
        to belong to this one organism; multi-organism complexes are not
        supported yet.
    name : str
        Custom name or identifier of the complex.
    ids : dict
        Identifiers of the complex. With more than one source (set, list
        or tuple) a dict of database names as keys and sets of
        identifiers as values is expected; with one source (string)
        either a set of identifiers or a single identifier.
    sources : set,str
        The database(s) defining the complex.
    interactions : list,dict
        Interactions among the components: a list of tuples of component
        IDs or a dict with such tuples as keys and custom interaction
        properties as values.
    references
        References of the complex, stored as a set.
    proteins : list,dict
        Used only if `components` is empty; synonym of `components`
        kept for compatibility.
    attrs : dict
        Custom attributes; values which are not dicts are ignored.
    """

    members = components or proteins

    if isinstance(members, dict):
        # stoichiometry provided explicitly
        self.components = members
    else:
        # stoichiometry from the repetition of the identifiers
        self.components = dict(collections.Counter(members))

    self.proteins = self.components
    self.name = name

    self.ids = collections.defaultdict(set)
    self.add_ids(ids, source=sources)

    self.sources = common.to_set(sources)
    self.references = common.to_set(references)
    self.ncbi_tax_id = taxonomy.ensure_ncbi_tax_id(ncbi_tax_id)

    self.attrs = dict(attrs) if isinstance(attrs, dict) else {}

    self.interactions = interactions