# Imports assumed from the surrounding pypath module context; the module
# paths follow recent pypath layouts and are an assumption, as are names
# used below but defined elsewhere in the module (e.g. homologene_dict,
# lrdb_interactions, LrdbAnnotation, netpath_names, _resources).
import collections
import csv
import itertools
import re

import bs4
from future.utils import iteritems

import pypath.inputs.common as inputs_common
import pypath.inputs.uniprot as uniprot_input
import pypath.resources.urls as urls
import pypath.share.common as common
import pypath.share.curl as curl
import pypath.utils.homology as homology
import pypath.utils.mapping as mapping
import pypath.utils.taxonomy as taxonomy


def lit_bm_interactions():
    """
    Literature curated interactions from Luck 2020 (the Lit-BM dataset).
    """

    LitBmInteraction = collections.namedtuple(
        'LitBmInteraction',
        ['uniprot_a', 'uniprot_b'],
    )

    url = urls.urls['hid']['lit-bm']
    c = curl.Curl(url, large=True, silent=False)

    for row in c.result:

        row = row.strip().split('\t')

        uniprots_a = mapping.map_name(row[0], 'ensembl', 'uniprot')
        uniprots_b = mapping.map_name(row[1], 'ensembl', 'uniprot')

        for uniprot_a, uniprot_b in itertools.product(uniprots_a, uniprots_b):

            yield LitBmInteraction(
                uniprot_a=uniprot_a,
                uniprot_b=uniprot_b,
            )
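# Usage sketch (not part of the original module): the function above is a
# generator, so it is typically consumed into a list; the IDs in the
# comment are illustrative placeholders.
#
#     lit_bm = list(lit_bm_interactions())
#     # each element is a namedtuple, e.g.
#     # LitBmInteraction(uniprot_a='P12345', uniprot_b='Q9XYZ1')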
def homologene_uniprot_dict(source, target, only_swissprot=True):
    """
    Returns an orthology translation table as a dict from UniProt to
    UniProt, obtained from NCBI HomoloGene data. Uses RefSeq and Entrez
    IDs for translation.

    :param int source:
        NCBI Taxonomy ID of the source species (keys).
    :param int target:
        NCBI Taxonomy ID of the target species (values).
    :param bool only_swissprot:
        Translate only SwissProt IDs.
    """

    result = {}

    hge = homologene_dict(source, target, 'entrez')
    hgr = homologene_dict(source, target, 'refseq')

    all_source = set(
        uniprot_input.all_uniprots(organism=source, swissprot='YES')
    )

    if not only_swissprot:

        all_source_trembl = uniprot_input.all_uniprots(
            organism=source,
            swissprot='NO',
        )
        all_source.update(set(all_source_trembl))

    for u in all_source:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
        target_u = set()
        target_r = set()
        target_e = set()

        for e in source_e:
            if e in hge:
                target_e.update(hge[e])

        for r in source_r:
            if r in hgr:
                target_r.update(hgr[r])

        for e in target_e:
            target_u.update(mapping.map_name(e, 'entrez', 'uniprot', target))

        for r in target_r:
            target_u.update(mapping.map_name(r, 'refseqp', 'uniprot', target))

        # translate all collected target IDs to primary UniProt IDs
        target_u = itertools.chain(
            *(
                mapping.map_name(tu, 'uniprot', 'uniprot', target)
                for tu in target_u
            )
        )

        result[u] = sorted(target_u)

    return result
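# Usage sketch (hedged): building a human-to-mouse UniProt orthology table
# with the function above; 9606 and 10090 are the NCBI Taxonomy IDs of
# human and mouse, and P00533 (human EGFR) is just an example key.
#
#     human_mouse = homologene_uniprot_dict(source=9606, target=10090)
#     mouse_orthologs = human_mouse.get('P00533', [])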
def depod_enzyme_substrate(organism=9606):

    result = []

    reunip = re.compile(r'uniprotkb:([A-Z0-9]+)')
    reptm = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')
    repmidsep = re.compile(r'[,|]\s?')

    url = urls.urls['depod']['urls'][0]
    c = curl.Curl(url, silent=False, encoding='ascii')
    data = c.result
    data = [x.split('\t') for x in data.split('\n')]
    del data[0]

    url_mitab = urls.urls['depod']['urls'][1]
    c_mitab = curl.Curl(url_mitab, silent=False, encoding='iso-8859-1')
    data_mitab = c_mitab.result
    data_mitab = [x.split('\t') for x in data_mitab.split('\n')]
    del data_mitab[0]

    for i, l in enumerate(data):

        if (
            len(l) > 6 and
            l[2] == 'protein substrate' and
            taxonomy.ensure_ncbi_tax_id(
                l[3].split('(')[0].strip()
            ) == organism and
            l[4].strip() != 'N/A'
        ):

            # the MI-TAB table runs parallel to the main table,
            # hence the same row index applies to both
            enzyme_uniprot = reunip.search(data_mitab[i][0]).groups()[0]
            substrate_uniprot = reunip.search(data_mitab[i][1]).groups()[0]

            for enzyme_up, substrate_up in itertools.product(
                mapping.map_name(enzyme_uniprot, 'uniprot', 'uniprot'),
                mapping.map_name(substrate_uniprot, 'uniprot', 'uniprot'),
            ):

                for resaa, resnum in reptm.findall(l[4]):

                    resnum = int(resnum)
                    resaa = common.aminoa_3_to_1_letter.get(resaa, resaa)

                    result.append({
                        'instance': None,
                        'kinase': enzyme_up,
                        'resaa': resaa,
                        'resnum': resnum,
                        'references': repmidsep.split(l[6].strip()),
                        'substrate': substrate_up,
                        'start': None,
                        'end': None,
                        'typ': 'dephosphorylation',
                    })

    return result
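# Usage sketch: each element of the list returned above is a dict
# describing one dephosphorylation event, keyed as built in the loop.
#
#     for es in depod_enzyme_substrate(organism=9606):
#         print(es['kinase'], es['substrate'], es['resaa'], es['resnum'])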
def homologene_uniprot_dict(self, source):
    """
    Builds an orthology translation table as a dict from UniProt to
    UniProt, obtained from NCBI HomoloGene data. Uses RefSeq and Entrez
    IDs for translation.
    """

    source = self.get_source(source)
    self.homo[source] = {}

    hge = homologene_dict(source, self.target, 'entrez')
    hgr = homologene_dict(source, self.target, 'refseq')

    self.load_proteome(source, self.only_swissprot)

    for u in self._proteomes[(source, self.only_swissprot)]:

        source_e = mapping.map_name(u, 'uniprot', 'entrez', source)
        source_r = mapping.map_name(u, 'uniprot', 'refseqp', source)
        target_u = set()
        target_r = set()
        target_e = set()

        for e in source_e:
            if e in hge:
                target_e.update(hge[e])

        for r in source_r:
            if r in hgr:
                target_r.update(hgr[r])

        for e in target_e:
            target_u.update(
                set(mapping.map_name(e, 'entrez', 'uniprot', self.target))
            )

        for r in target_r:
            target_u.update(
                set(mapping.map_name(r, 'refseqp', 'uniprot', self.target))
            )

        # translate all collected target IDs to primary UniProt IDs
        target_u = itertools.chain(
            *(
                mapping.map_name(tu, 'uniprot', 'uniprot', self.target)
                for tu in target_u
            )
        )

        self.homo[source][u] = sorted(target_u)
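# Usage sketch (hedged): this method lives on an orthology translation
# class; assuming an instance `ph` with `target`, `only_swissprot` and the
# proteome cache set up as the method expects:
#
#     ph.homologene_uniprot_dict(source=10090)
#     mouse_table = ph.homo[10090]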
def build_gene(self):

    self.cpdb_gene = set()

    for entity in self._entities:

        # we add the components of the complexes to the protein data
        # frame; possibly not necessary, but it does no harm
        if hasattr(entity, 'components'):
            components = entity.components
        else:
            components = (entity,)

        for comp in components:

            name = mapping.map_name0(comp, 'uniprot', 'genesymbol')
            ensembl_genes = mapping.map_name(comp, 'uniprot', 'ensembl')

            for ensembl in ensembl_genes:

                self.cpdb_gene.add(
                    CellPhoneDBGene(
                        gene_name=name,
                        uniprot=comp,
                        hgnc_symbol=name,
                        ensembl=ensembl,
                    )
                )
def cellcellinteractions_annotations():

    CellcellinteractionsAnnotation = collections.namedtuple(
        'CellcellinteractionsAnnotation',
        [
            'mainclass',
        ]
    )

    url = urls.urls['cellcellinteractions']['url']
    c = curl.Curl(url, silent = False, large = True)

    _ = next(c.result)

    result = collections.defaultdict(set)

    for row in c.result:

        row = row.strip('\r\n').split('\t')

        uniprots = mapping.map_name(row[0], 'genesymbol', 'uniprot')
        classes = row[1].split('/')

        for uniprot in uniprots:

            for cls in classes:

                result[uniprot].add(
                    CellcellinteractionsAnnotation(mainclass = cls)
                )

    return dict(result)
def _map_ids(_id):

    # Ensembl protein and transcript IDs get their own ID type,
    # anything else is treated as UniProt
    id_type = (
        _id[:4].lower()
        if _id[:4].lower() in {'ensp', 'enst'} else
        'uniprot'
    )

    return mapping.map_name(_id, id_type, 'uniprot')
def lrdb_annotations():

    result = collections.defaultdict(set)

    lrdb = lrdb_interactions()

    for rec in lrdb:

        for role in ('ligand', 'receptor'):

            uniprots = mapping.map_name(
                getattr(rec, '%s_genesymbol' % role),
                'genesymbol',
                'uniprot',
            )

            for uniprot in uniprots:

                cell_types = getattr(rec, '%s_cells' % role) or (None,)

                for cell_type in cell_types:

                    # normalize cell type labels from the source data
                    cell_type = (
                        'T lymphocyte'
                        if cell_type == 'tymphocyte' else
                        cell_type.replace('cells', 'cell')
                        if cell_type else
                        None
                    )

                    result[uniprot].add(
                        LrdbAnnotation(
                            role=role,
                            cell_type=cell_type,
                            sources=tuple(sorted(rec.sources)),
                            references=tuple(sorted(rec.references)),
                        )
                    )

    return dict(result)
def netpath_pathway_annotations():

    NetpathPathway = collections.namedtuple(
        'NetpathPathway',
        ['pathway'],
    )

    result = collections.defaultdict(set)

    url_template = urls.urls['netpath_pw']['url']
    url_main = urls.urls['netpath_pw']['mainpage']

    c = curl.Curl(url_main, cache = False)

    cookie = [
        h.decode().split(':')[1].split(';')[0].strip()
        for h in c.resp_headers
        if h.startswith(b'Set-Cookie')
    ]
    cookie_hdr = ['Cookie: %s' % '; '.join(cookie)]

    pathway_ids = netpath_names()

    for _id, pathway in iteritems(pathway_ids):

        url = url_template % int(_id)
        c = curl.Curl(
            url,
            req_headers = cookie_hdr,
            silent = False,
            encoding = 'iso-8859-1',
        )

        soup = bs4.BeautifulSoup(c.result, 'html.parser')

        for tbl in soup.find_all('table'):

            hdr = tbl.find('td', {'class': 'barhead'})

            if not hdr or not hdr.text.strip().startswith('Molecules Invol'):
                continue

            for td in tbl.find_all('td'):

                genesymbol = td.text.strip()

                if not genesymbol:
                    continue

                uniprots = mapping.map_name(
                    genesymbol,
                    'genesymbol',
                    'uniprot',
                )

                for uniprot in uniprots:

                    result[uniprot].add(
                        NetpathPathway(pathway = pathway)
                    )

    return result
def cspa_cell_types(organism = 9606):

    sheets = {
        'Human': 'Table_E',
        'Mouse': 'Table_F',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    url = urls.urls['cspa']['url_s1']
    c = curl.Curl(url, large = True, silent = False)
    xlsname = c.fname
    del c

    raw = inputs_common.read_xls(xlsname, sheets[str_organism])

    result = collections.defaultdict(lambda: collections.defaultdict(dict))

    cell_types = raw[0][1:]

    for row in raw[1:]:

        for uniprot in mapping.map_name(row[0], 'uniprot', 'uniprot'):

            for col, cell_type in enumerate(cell_types):

                value = row[col + 1]

                result[cell_type][uniprot] = (
                    float(value)
                    if common.is_float(value) else
                    None
                )

    return result
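# Usage sketch: the nested dicts built above map
# cell type -> UniProt ID -> expression value (float, or None where the
# cell could not be parsed as a number).
#
#     cspa_exp = cspa_cell_types(organism = 9606)
#     for cell_type, profile in cspa_exp.items():
#         n_measured = sum(1 for v in profile.values() if v is not None)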
def disgenet_annotations(dataset='curated'):
    """
    Downloads and processes the list of all human disease related proteins
    from DisGeNet.
    Returns dict of sets of annotation records.

    @dataset : str
        Name of DisGeNet dataset to be obtained:
        `curated`, `literature`, `befree` or `all`.
    """

    DisGeNetAnnotation = collections.namedtuple(
        'DisGeNetAnnotation',
        [
            'disease',
            'score',
            'dsi',
            'dpi',
            'nof_pmids',
            'nof_snps',
            'source',
        ]
    )

    url = urls.urls['disgenet']['url'] % dataset
    c = curl.Curl(
        url,
        silent=False,
        large=True,
        encoding='utf-8',
        default_mode='r',
    )
    reader = csv.DictReader(c.result, delimiter='\t')

    data = collections.defaultdict(set)

    for rec in reader:

        uniprots = mapping.map_name(
            rec['geneSymbol'],
            'genesymbol',
            'uniprot',
        )

        if not uniprots:
            continue

        for uniprot in uniprots:

            data[uniprot].add(
                DisGeNetAnnotation(
                    disease=rec['diseaseName'],
                    score=float(rec['score']),
                    dsi=float(rec['DSI']) if rec['DSI'] else None,
                    dpi=float(rec['DPI']) if rec['DPI'] else None,
                    nof_pmids=int(rec['NofPmids']),
                    nof_snps=int(rec['NofSnps']),
                    source=tuple(
                        x.strip()
                        for x in rec['source'].split(';')
                    ),
                )
            )

    return data
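# Usage sketch: looking up the disease annotations of one protein from the
# curated dataset; P04637 (human TP53) is just an example key.
#
#     disgenet = disgenet_annotations(dataset='curated')
#     for a in disgenet.get('P04637', ()):
#         print(a.disease, a.score)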
def kea_interactions():

    KeaRecord = collections.namedtuple(
        'KeaRecord',
        [
            'enzyme',
            'substrate',
            'residue_type',
            'residue_offset',
            'pmid',
            'resource',
        ]
    )

    resub = re.compile(r'(\w+)_([A-Z])([0-9]+)')

    url = urls.urls['kea']['kinase_substrate']
    c = curl.Curl(url, silent=False, large=True)

    result = []

    for rec in c.result:

        rec = rec.strip().split('\t')
        site = resub.match(rec[1].strip())

        if not site:
            continue

        target, resaa, resnum = site.groups()

        e_uniprots = mapping.map_name(rec[0], 'genesymbol', 'uniprot')
        s_uniprots = mapping.map_name(target, 'genesymbol', 'uniprot')

        for enz, sub in itertools.product(e_uniprots, s_uniprots):

            result.append(
                KeaRecord(
                    enzyme=enz,
                    substrate=sub,
                    residue_type=resaa,
                    residue_offset=int(resnum),
                    pmid=rec[2].strip(),
                    resource=_resources[rec[3].strip()],
                )
            )

    return result
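# Usage sketch: counting the KEA records per original resource; relies
# only on the KeaRecord fields defined above.
#
#     from collections import Counter
#     by_resource = Counter(rec.resource for rec in kea_interactions())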
def almen2009_annotations():

    resep = re.compile(r'[;/]')

    Almen2009Annotation = collections.namedtuple(
        'Almen2009Annotation',
        [
            'mainclass',
            'classes',
            'phobius_secreted',
            'phobius_transmembrane',
            'sosui_transmembrane',
            'tmhmm_transmembrane',
        ]
    )

    url = urls.urls['almen2009']['url']
    c = curl.Curl(url, silent = False, large = True)
    xls = c.fileobj
    xlsfile = xls.name
    xls.close()

    tbl = inputs_common.read_xls(xlsfile, sheet = 'Data')[1:]

    result = collections.defaultdict(set)

    for row in tbl:

        uniprots = mapping.map_name(row[0], 'ipi', 'uniprot')

        mainclass = row[2]
        # fix a typo in the source data
        classes = row[3].replace('KInase', 'Kinase')
        classes = tuple(sorted(resep.split(classes)))
        phobius_transmembrane = int(float(row[5]))
        phobius_secreted = row[6] == 'Y'
        sosui_transmembrane = (
            int(float(row[8]))
            if row[8] != 'ERROR' else
            0
        )
        tmhmm_transmembrane = int(float(row[10]))

        for uniprot in uniprots:

            result[uniprot].add(
                Almen2009Annotation(
                    mainclass = mainclass,
                    classes = classes,
                    phobius_secreted = phobius_secreted,
                    phobius_transmembrane = phobius_transmembrane,
                    sosui_transmembrane = sosui_transmembrane,
                    tmhmm_transmembrane = tmhmm_transmembrane,
                )
            )

    return result
def process_name(name):

    return (
        (complexes_by_name[name],)
        if name in complexes_by_name else
        mapping.map_name(
            name,
            'genesymbol',
            'uniprot',
            ncbi_tax_id = ncbi_tax_id,
        )
    )
def signalink_annotations(organism = 9606):

    SignalinkPathway = collections.namedtuple(
        'SignalinkPathway',
        [
            'pathway',
        ]
    )

    SignalinkFunction = collections.namedtuple(
        'SignalinkFunction',
        [
            'function',
        ]
    )

    result = {
        'pathway': collections.defaultdict(set),
        'function': collections.defaultdict(set),
    }

    interactions = signalink_interactions(organism = organism)

    for i in interactions:

        for postfix in ('_a', '_b'):

            _id = getattr(i, 'id%s' % postfix)

            for uniprot in mapping.map_name(_id, 'uniprot', 'uniprot'):

                for attr, record in zip(
                    ('pathway', 'function'),
                    (SignalinkPathway, SignalinkFunction),
                ):

                    values = getattr(i, '%ss%s' % (attr, postfix))

                    for value in values:

                        result[attr][uniprot].add(record(value))

    return result
def cspa_annotations(organism = 9606):

    CspaAnnotation = collections.namedtuple(
        'CspaAnnotation',
        [
            'high_confidence',
            'n_cell_types',
            'tm',
            'gpi',
            'uniprot_cell_surface',
        ],
    )

    sheets = {
        'Human': 'Table A',
        'Mouse': 'Table B',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    url = urls.urls['cspa']['url_s2']
    c = curl.Curl(url, large = True, silent = False)
    xlsname = c.fname
    del c

    raw = inputs_common.read_xls(xlsname, sheets[str_organism])[1:]

    result = collections.defaultdict(set)

    for row in raw:

        for uniprot in mapping.map_name(row[1], 'uniprot', 'uniprot'):

            result[uniprot].add(
                CspaAnnotation(
                    high_confidence = 'high confidence' in row[2],
                    n_cell_types = int(float(row[9])),
                    tm = int(float(row[11])),
                    gpi = int(float(row[12])),
                    uniprot_cell_surface = row[13] == 'yes',
                )
            )

    return dict(result)
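# Usage sketch (hedged): querying the CSPA surface annotation of one
# protein; P00533 (human EGFR) is just an example key.
#
#     cspa = cspa_annotations(organism = 9606)
#     for a in cspa.get('P00533', set()):
#         print(a.high_confidence, a.n_cell_types)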
def _embrace_id_translation(mouse_genesymbol, organism=9606):

    uniprots = mapping.map_name(
        mouse_genesymbol,
        'genesymbol',
        'uniprot',
        ncbi_tax_id=10090,
    )

    if organism != 10090:

        uniprots = homology.translate(
            uniprots,
            target=organism,
            source=10090,
        )

    return uniprots or [None]
def surfaceome_annotations():
    """
    Downloads the "In silico human surfaceome".
    Dict with UniProt IDs as keys and tuples of surface prediction score,
    class and subclass as values (columns B, N, S and T of table S3).
    """

    url = urls.urls['surfaceome']['url']
    c = curl.Curl(url, large=True, silent=False)
    xlsname = c.fname
    del c

    raw = inputs_common.read_xls(xlsname, 'in silico surfaceome only')[2:]

    return dict(
        (
            uniprot,  # uniprot
            (
                float(r[13]),  # score
                r[18] if r[18] else None,  # class
                (
                    set(r[19].replace('KInase', 'Kinase').split(';'))
                    if r[19] else
                    set()
                ),  # subclass
            ),
        )
        for r in raw
        for uniprot in mapping.map_name(r[1], 'uniprot', 'uniprot')
    )
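# Usage sketch: unpacking the (score, class, subclasses) tuple documented
# in the docstring above; P00533 is just an example key.
#
#     surfaceome = surfaceome_annotations()
#     score, cls, subclasses = surfaceome.get('P00533', (None, None, set()))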
def cancersea_annotations():
    """
    Retrieves genes annotated with cancer functional states from the
    CancerSEA database.
    """

    CancerseaAnnotation = collections.namedtuple(
        'CancerseaAnnotation',
        [
            'state',
        ],
    )

    annotations = collections.defaultdict(set)

    url = urls.urls['cancersea']['url']
    c = curl.Curl(url, silent=False, large=False)
    soup = bs4.BeautifulSoup(c.result, 'html.parser')

    for row in soup.find_all('tbody')[1].find_all('tr'):

        state = row.find_all('td')[0].text
        url_end = row.find_all('td')[-1].find('a').attrs['href']
        data_url = urls.urls['cancersea']['data_url'] % url_end

        c = curl.Curl(data_url, silent=False, large=True)
        _ = next(c.result)

        for line in c.result:

            line = line.strip().split('\t')
            uniprots = mapping.map_name(line[1], 'genesymbol', 'uniprot')

            for uniprot in uniprots:

                annotations[uniprot].add(CancerseaAnnotation(state=state))

    return dict(annotations)
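# Usage sketch: each annotation carries a single `state` field, so the
# functional states of a protein can be listed as below; P04637 (human
# TP53) is just an example key.
#
#     cancersea = cancersea_annotations()
#     states = {a.state for a in cancersea.get('P04637', ())}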