def netpath_pathway_annotations():
    """
    Pathway annotations from NetPath.
    """
    NetpathPathway = collections.namedtuple(
        'NetpathPathway',
        ['pathway'],
    )

    result = collections.defaultdict(set)

    url_template = urls.urls['netpath_pw']['url']
    url_main = urls.urls['netpath_pw']['mainpage']

    # the NetPath server requires the session cookie from the main page
    c = curl.Curl(url_main, cache = False)
    cookie = [
        h.decode().split(':')[1].split(';')[0].strip()
        for h in c.resp_headers
        if h.startswith(b'Set-Cookie')
    ]
    cookie_hdr = ['Cookie: %s' % '; '.join(cookie)]

    pathway_ids = netpath_names()

    for _id, pathway in iteritems(pathway_ids):

        url = url_template % int(_id)
        c = curl.Curl(
            url,
            req_headers = cookie_hdr,
            silent = False,
            encoding = 'iso-8859-1',
        )

        soup = bs4.BeautifulSoup(c.result, 'html.parser')

        for tbl in soup.find_all('table'):

            hdr = tbl.find('td', {'class': 'barhead'})

            if not hdr or not hdr.text.strip().startswith('Molecules Invol'):
                continue

            for td in tbl.find_all('td'):

                genesymbol = td.text.strip()

                if not genesymbol:
                    continue

                uniprots = mapping.map_name(
                    genesymbol,
                    'genesymbol',
                    'uniprot',
                )

                for uniprot in uniprots:
                    result[uniprot].add(
                        NetpathPathway(pathway = pathway)
                    )

    return result
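# A usage sketch, not part of the original module: the function above returns
# a dict of UniProt IDs to sets of `NetpathPathway` records, so per-pathway
# protein counts fall out of a Counter over the flattened values. The helper
# name `_netpath_pathway_sizes` is hypothetical.
def _netpath_pathway_sizes():

    annots = netpath_pathway_annotations()

    return collections.Counter(
        record.pathway
        for records in annots.values()
        for record in records
    )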
def depod_enzyme_substrate(organism=9606):
    """
    Dephosphorylation sites from DEPOD.
    Returns list of dicts.
    """
    result = []
    reunip = re.compile(r'uniprotkb:([A-Z0-9]+)')
    reptm = re.compile(r'([A-Z][a-z]{2})-([0-9]+)')
    repmidsep = re.compile(r'[,|]\s?')

    url = urls.urls['depod']['urls'][0]
    c = curl.Curl(url, silent=False, encoding='ascii')
    data = c.result
    data = [x.split('\t') for x in data.split('\n')]
    del data[0]

    url_mitab = urls.urls['depod']['urls'][1]
    c_mitab = curl.Curl(url_mitab, silent=False, encoding='iso-8859-1')
    data_mitab = c_mitab.result
    data_mitab = [x.split('\t') for x in data_mitab.split('\n')]
    del data_mitab[0]

    for i, l in enumerate(data):

        if (
            len(l) > 6 and
            l[2] == 'protein substrate' and
            taxonomy.ensure_ncbi_tax_id(
                l[3].split('(')[0].strip()
            ) == organism and
            l[4].strip() != 'N/A'
        ):
            enzyme_uniprot = reunip.search(data_mitab[i][0]).groups()[0]
            substrate_uniprot = reunip.search(data_mitab[i][1]).groups()[0]

            for enzyme_up, substrate_up in itertools.product(
                mapping.map_name(enzyme_uniprot, 'uniprot', 'uniprot'),
                mapping.map_name(substrate_uniprot, 'uniprot', 'uniprot'),
            ):
                for resaa, resnum in reptm.findall(l[4]):

                    resnum = int(resnum)
                    resaa = (
                        common.aminoa_3_to_1_letter[resaa]
                        if resaa in common.aminoa_3_to_1_letter
                        else resaa
                    )

                    result.append({
                        'instance': None,
                        'kinase': enzyme_up,
                        'resaa': resaa,
                        'resnum': resnum,
                        'references': repmidsep.split(l[6].strip()),
                        'substrate': substrate_up,
                        'start': None,
                        'end': None,
                        'typ': 'dephosphorylation',
                    })

    return result
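# A minimal consumption sketch (hypothetical helper): `depod_enzyme_substrate`
# returns plain dicts with `resaa` holding one letter residue codes after the
# conversion above, so filtering for a residue type is a key lookup per record.
def _depod_sites_by_residue(resaa='S', organism=9606):

    return [
        rec
        for rec in depod_enzyme_substrate(organism=organism)
        if rec['resaa'] == resaa
    ]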
def genecards_datasheet(gene):
    """
    Retrieves a gene (protein) datasheet from GeneCards.
    Returns HTML as string.

    :param str gene:
        A Gene Symbol or UniProt ID.
    """
    url = urls.urls['genecards']['url'] % gene

    c = curl.Curl(
        url,
        silent = True,
        large = False,
        connect_timeout = settings.get('genecards_datasheet_connect_timeout'),
        timeout = settings.get('genecards_datasheet_timeout'),
    )

    if c.status not in {0, 200}:

        _log('Failed to retrieve gene card for ID `%s`.' % gene)

        return None

    return c.result
def get_isoforms(organism=9606):
    """
    Loads UniProt sequences for all isoforms.
    """
    if organism in taxonomy.phosphoelm_taxids:
        organism = taxonomy.phosphoelm_taxids[organism]

    reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')
    result = {}

    url = urls.urls['unip_iso']['url']
    c = curl.Curl(url, silent=False)
    data = c.result
    data = read_fasta(data)

    for header, seq in iteritems(data):

        org = reorg.findall(header)

        if len(org) > 0 and org[0] == organism:

            prot = header.split('|')[1].split('-')
            unip = prot[0]
            isof = int(prot[1])

            if unip not in result:
                result[unip] = {}

            result[unip][isof] = seq

    return result
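# Sketch of typical use (assumed, not from the original module): the result
# is keyed first by primary accession, then by isoform number, hence the
# number of known isoforms per protein is just the length of the inner dict.
def _isoform_counts(organism=9606):

    return {
        uniprot: len(isoforms)
        for uniprot, isoforms in get_isoforms(organism=organism).items()
    }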
def cspa_cell_types(organism = 9606):
    """
    Expression of cell surface proteins across cell types from the
    Cell Surface Protein Atlas (CSPA).
    """
    sheets = {
        'Human': 'Table_E',
        'Mouse': 'Table_F',
    }

    str_organism = taxonomy.taxids[organism].capitalize()

    url = urls.urls['cspa']['url_s1']
    c = curl.Curl(url, large = True, silent = False)
    xlsname = c.fname
    del(c)
    raw = inputs_common.read_xls(xlsname, sheets[str_organism])

    result = collections.defaultdict(lambda: collections.defaultdict(dict))

    cell_types = raw[0][1:]

    for row in raw[1:]:

        for uniprot in mapping.map_name(row[0], 'uniprot', 'uniprot'):

            for col, cell_type in enumerate(cell_types):

                value = row[col + 1]

                result[cell_type][uniprot] = (
                    float(value)
                    if common.is_float(value)
                    else None
                )

    return result
def cellcellinteractions_annotations():
    """
    Protein class annotations from the Cell-Cell Interactions database.
    """
    CellcellinteractionsAnnotation = collections.namedtuple(
        'CellcellinteractionsAnnotation',
        ['mainclass'],
    )

    url = urls.urls['cellcellinteractions']['url']
    c = curl.Curl(url, silent = False, large = True)
    # header row
    _ = next(c.result)

    result = collections.defaultdict(set)

    for row in c.result:

        row = row.strip('\r\n').split('\t')

        uniprots = mapping.map_name(row[0], 'genesymbol', 'uniprot')
        classes = row[1].split('/')

        for uniprot in uniprots:

            for cls in classes:

                result[uniprot].add(
                    CellcellinteractionsAnnotation(mainclass = cls)
                )

    return dict(result)
def get_dorothea_old(levels={'A', 'B'}, only_curated=False):
    """
    Retrieves TF-target interactions from DoRothEA.

    :param set levels:
        Confidence levels to be used.
    :param bool only_curated:
        Retrieve only literature curated interactions.

    Details
    -------
    DoRothEA is a comprehensive resource of TF-target interactions
    combining multiple lines of evidence: literature curated databases,
    ChIP-Seq data, PWM based prediction using HOCOMOCO and JASPAR
    matrices and prediction from GTEx expression data by ARACNe.

    For details see https://github.com/saezlab/DoRothEA.
    """
    url = urls.urls['dorothea']['url'] % (
        'all' if 'E' in levels else
        'ABCD' if 'D' in levels else
        'ABC' if 'C' in levels else
        'AB' if 'B' in levels else
        'A'
    )

    c = curl.Curl(url, silent=False, large=True)
    # header row
    _ = next(c.result)

    return (
        list(
            itertools.chain(
                ll[:4],
                (s == 'TRUE' for s in ll[4:8]),
                ll[-4:],
                [','.join(s for s in ll[-4:] if s)]
                if not only_curated else
                ll[8],
            )
        )
        for ll in (
            l.strip('\n\r').split('\t')
            for l in c.result
        )
        if (
            ll[3] in levels and not only_curated or
            ll[4] == 'TRUE'
        )
    )
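# Hypothetical convenience wrapper for the function above: download the
# level A file and keep only rows flagged as literature curated. Note that
# `levels` selects which file is downloaded, while `only_curated` switches
# the row filter to the curation flag.
def _dorothea_a_curated():

    return list(get_dorothea_old(levels={'A'}, only_curated=True))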
def ipi_uniprot():
    """
    Retrieves an IPI-UniProt mapping dictionary.
    """
    result = collections.defaultdict(set)

    url = urls.urls['ipi']['url']
    c = curl.Curl(url, large=True, silent=False)

    for row in c.result:

        row = row.strip('\n\r').split('\t')

        if len(row) < 3:
            continue

        ipi_id = row[2]
        uniprot, isoform = inputs_common._try_isoform(row[1])

        # skip identifiers from other namespaces
        # (RefSeq, Vega, H-InvDB, Ensembl)
        is_uniprot = not any(
            uniprot.startswith(pref)
            for pref in ('NP_', 'OTTH', 'HIT', 'ENSP', 'XP_')
        )

        if is_uniprot:
            result[ipi_id].add(uniprot)

    return dict(result)
def lit_bm_interactions():
    """
    Literature collected interactions from Luck 2020.
    """
    LitBmInteraction = collections.namedtuple(
        'LitBmInteraction',
        ['uniprot_a', 'uniprot_b'],
    )

    url = urls.urls['hid']['lit-bm']
    c = curl.Curl(url, large=True, silent=False)

    for row in c.result:

        row = row.strip().split('\t')

        uniprots_a = mapping.map_name(row[0], 'ensembl', 'uniprot')
        uniprots_b = mapping.map_name(row[1], 'ensembl', 'uniprot')

        for uniprot_a, uniprot_b in itertools.product(uniprots_a, uniprots_b):

            yield LitBmInteraction(
                uniprot_a=uniprot_a,
                uniprot_b=uniprot_b,
            )
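# Sketch (assumption: these interactions are undirected): the function above
# is a generator, so materialize it once; de-duplicating by sorted pair
# removes A-B / B-A doublets. The helper name is hypothetical.
def _lit_bm_unique_pairs():

    return {
        tuple(sorted((i.uniprot_a, i.uniprot_b)))
        for i in lit_bm_interactions()
    }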
def adhesome_annotations():
    """
    Functional annotations of adhesome components.
    """
    AdhesomeAnnotation = collections.namedtuple(
        'AdhesomeAnnotation',
        ['mainclass', 'intrinsic'],
    )

    result = collections.defaultdict(set)

    url = urls.urls['adhesome']['components']
    c = curl.Curl(url, large = True, silent = False)

    data = csv.DictReader(c.result, delimiter = ',')

    for rec in data:

        uniprots = rec['Swiss-Prot ID']

        for uniprot in uniprots.split(','):

            uniprot = uniprot.strip()

            if uniprot == 'null':
                continue

            for _uniprot in mapping.map_name(uniprot, 'uniprot', 'uniprot'):

                result[_uniprot].add(AdhesomeAnnotation(
                    mainclass = (
                        common.upper0(rec['Functional Category'].strip())
                    ),
                    intrinsic = rec['FA'].strip() == 'Intrinsic Proteins',
                ))

    return result
def get_pfam_names():
    """
    Mappings between Pfam accessions and domain names.
    Returns two dictionaries: name-to-accessions and accession-to-names.
    """
    c = curl.Curl(urls.urls['pfam_pdb']['url'], silent=False)
    data = c.result

    if data is None:
        return None, None

    dname_pfam = {}
    pfam_dname = {}

    data = data.replace('\r', '').split('\n')
    # header row
    del data[0]

    for l in data:

        l = l.split('\t')

        if len(l) > 5:

            pfam = l[4].split('.')[0]
            name = l[5]

            if pfam not in pfam_dname:
                pfam_dname[pfam] = []

            if name not in dname_pfam:
                dname_pfam[name] = []

            pfam_dname[pfam].append(name)
            dname_pfam[name].append(pfam)

    for k, v in iteritems(pfam_dname):
        pfam_dname[k] = list(set(v))

    for k, v in iteritems(dname_pfam):
        dname_pfam[k] = list(set(v))

    return dname_pfam, pfam_dname
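# Usage sketch (hypothetical helper): unpacking order matters, the function
# above returns the name-to-accessions dict first. `PF00069` below is only an
# example accession; the guard covers the `None, None` failure branch.
def _pfam_names_of(pfam_ac='PF00069'):

    dname_pfam, pfam_dname = get_pfam_names()

    return (pfam_dname or {}).get(pfam_ac, [])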
def dbptm_enzyme_substrate(organism=9606):
    """
    Downloads enzyme-substrate interactions from dbPTM.
    Returns list of dicts.
    """
    if organism is None:
        _organism = None
    elif organism in taxonomy.dbptm_taxids:
        _organism = taxonomy.dbptm_taxids[organism]
    else:
        sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)
        return []

    url = urls.urls['dbptm']['old_table']
    c = curl.Curl(url, silent=False, large=True)
    data = []
    hdr = next(c.result).strip().split('\t')

    for l in c.result:

        l = l.strip().split('\t')

        data.append(dict(
            (
                key,
                (
                    None if val == '' else
                    val.split(';') if key in {'references', 'kinase'} else
                    int(val) if val.isdigit() else
                    val
                )
            )
            for key, val in zip(hdr, l)
        ))

    return data
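# Sketch of the record shape (keys come from the file header, so the key name
# below is an assumption based on the parsing logic above): `references` and
# `kinase` are lists, numeric fields are ints, empty fields are None.
def _dbptm_with_kinase(organism=9606):

    return [
        rec
        for rec in dbptm_enzyme_substrate(organism=organism)
        if rec.get('kinase')
    ]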
def query(self, api, param, silent=False, large=False):
    """
    Retrieves data from the API.

    @api : str
        Should be one of the 10 API sections available.
    @param : tuple
        Tuple of the parameters according to the API.
    @large : bool
        Passed to the curl wrapper function. If True, the file will be
        written to disk, and a file object open for reading is returned;
        if False, the raw data will be returned, in case of JSON,
        converted to python object, in case of XML, as a string.
    """
    url = self.urls[api] % param

    # a long timeout is given because huge files (hundreds of MB)
    # take time to download
    c = curl.Curl(
        url,
        req_headers=self.auth,
        silent=silent,
        timeout=1200,
        large=large,
    )
    self.tmp = c

    if self.output_format == 'json' and not large:
        self.result = self.get_json(c.result)
    else:
        self.result = c.fileobj
def get_uniprot_sec(organism=9606):
    """
    Downloads and processes the mapping between secondary and
    primary UniProt IDs.

    Yields pairs of secondary and primary UniProt IDs.

    :param int organism:
        NCBI Taxonomy ID of the organism.
    """
    if organism is not None:
        proteome = all_uniprots(organism=organism)
        proteome = set(proteome)

    url = urls.urls['uniprot_sec']['url']
    c = curl.Curl(url, silent=False, large=True, timeout=2400)

    for line in filter(
        lambda line:
            len(line) == 2 and
            (organism is None or line[1] in proteome),
        map(
            lambda i: i[1].split(),
            # the first 30 lines of the file are the header
            filter(
                lambda i: i[0] >= 30,
                enumerate(c.result),
            ),
        ),
    ):
        yield line
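# Usage sketch: collapse the generator above into a secondary-to-primary
# lookup dict; keeping only the first primary per secondary AC is a
# simplifying assumption, as one secondary ID may map to several primaries.
def _uniprot_sec_to_pri(organism=9606):

    sec_pri = {}

    for sec, pri in get_uniprot_sec(organism=organism):
        sec_pri.setdefault(sec, pri)

    return sec_pri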
def lit_bm_13_interactions():
    """
    Downloads and processes the Lit-BM-13 dataset, the 2013 version of the
    high confidence literature curated interactions from CCSB.
    Yields interactions as named tuples.
    """
    LitBm13Interaction = collections.namedtuple(
        'LitBm13Interaction',
        ['entrez_a', 'entrez_b', 'genesymbol_a', 'genesymbol_b'],
    )

    url = urls.urls['hid']['lit-bm-13']
    c = curl.Curl(url, silent=False, large=True)
    # header row
    _ = next(c.result)

    for row in c.result:

        row = row.strip().split('\t')

        yield LitBm13Interaction(
            entrez_a=row[0],
            entrez_b=row[2],
            genesymbol_a=row[1],
            genesymbol_b=row[3],
        )
def uniprot_history(identifier):
    """
    Retrieves the history of a record.
    Returns a generator iterating over the history from most recent
    to the oldest.
    """
    if valid_uniprot(identifier):

        url_history = urls.urls['uniprot_basic']['history'] % identifier
        c_history = curl.Curl(
            url_history,
            silent=True,
            large=True,
        )

        if c_history.result:

            line0 = next(c_history.result)

            if not line0.startswith('<!DOCTYPE'):

                for line in c_history.result:

                    if line:

                        yield UniprotRecordHistory(
                            *(
                                field.strip()
                                for field in line.split('\t')
                            )
                        )
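# Minimal sketch built on the generator above: because the history is yielded
# newest first, the most recent record is simply the first item. The helper
# name is hypothetical.
def _uniprot_most_recent(identifier):

    return next(uniprot_history(identifier), None)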
def _uniprot_deleted(swissprot=True, confirm=True):
    """
    Loads the list of deleted UniProt IDs (SwissProt or TrEMBL).
    """
    if not swissprot and confirm:

        resp = input(
            'Loading the list of deleted TrEMBL IDs requires '
            '>5GB memory. Do you want to proceed [y/n] '
        )

        if not resp or resp[0].lower() != 'y':
            return set()

    key = 'deleted_%s' % ('sp' if swissprot else 'tr')
    url = urls.urls['uniprot_basic'][key]

    c = curl.Curl(url, silent=False, large=True)

    result = set()

    for line in c.result:

        m = reac.match(line.strip())

        if m:
            result.add(m.groups()[0])

    return result
def _matrixdb_protein_list(category, organism=9606):
    """
    Returns a set of proteins annotated by MatrixDB.

    :arg str category:
        The protein annotation category. Possible values: `ecm`,
        `membrane` or `secreted`.
    """
    url = urls.urls['matrixdb']['%s_proteins' % category]
    c = curl.Curl(url, silent=False, large=True)

    proteins = set()

    # header row
    _ = next(c.result)

    for l in c.result:

        if not l:
            continue

        proteins.add(
            l.strip().replace('"', '').split('\t')[0]
        )

    proteins = mapping.map_names(proteins, 'uniprot', 'uniprot')

    if organism:

        uniprots = uniprot_input.all_uniprots(
            organism=organism,
            swissprot=True,
        )
        proteins = proteins & set(uniprots)

    return proteins
def uniprot_history_recent_datasheet(identifier):
    """
    Retrieves the most recent available datasheet for an obsolete UniProt
    ID: either the replacing record or the last archived version.
    """
    recent_version = uniprot_recent_version(identifier)

    if recent_version:

        if recent_version.replaced_by:

            new = recent_version.replaced_by.split(';')[0]
            url = urls.urls['uniprot_basic']['datasheet'] % new

            _logger._log(
                'UniProt ID `%s` is obsolete, has been replaced by '
                '`%s`: `%s`.' % (identifier, new, url)
            )

            return protein_datasheet(new)

        else:

            version = int(recent_version.entry_version)
            url = '%s?version=%u' % (
                urls.urls['uniprot_basic']['datasheet'] % identifier,
                version,
            )

            _logger._log(
                'UniProt ID `%s` is obsolete, downloading archived '
                'version %u: `%s`.' % (identifier, version, url)
            )

            return _protein_datasheet(url)

    return []
def uniprot_taxonomy():
    """
    Returns a dictionary with SwissProt IDs as keys and sets of
    various taxon names as values.
    """
    rename = re.compile(r'\(?(\w[\w\s\',/\.-]+\w)\)?')
    reac = re.compile(r'\s*\w+\s+\(([A-Z\d]+)\)\s*,')

    url = urls.urls['uniprot_basic']['speindex']
    c = curl.Curl(url, large=True, silent=False)

    result = collections.defaultdict(set)

    for line in c.result:

        # taxon names start at the beginning of the line;
        # the indented lines below them list the accessions
        if line[0] != ' ':
            names = set(rename.findall(line))
        else:
            for ac in reac.findall(line):
                result[ac].update(names)

    return result
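# Usage sketch (hypothetical inversion): turn the SwissProt-to-names mapping
# around to collect all accessions carrying a given taxon name.
def _uniprots_by_taxon_name(name):

    by_name = collections.defaultdict(set)

    for ac, names in uniprot_taxonomy().items():
        for n in names:
            by_name[n].add(ac)

    return by_name.get(name, set())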
def connectomedb_interactions():
    """
    Retrieves ligand-receptor interactions from connectomeDB2020
    https://asrhou.github.io/NATMI/
    """
    ConnectomedbInteraction = collections.namedtuple(
        'ConnectomedbInteraction',
        ['ligand', 'ligand_location', 'receptor', 'references'],
    )

    rea = re.compile(r'<a[^>]+>([^<]*)</a>')
    resemicol = re.compile(r'; ?')

    url = urls.urls['connectomedb2020']['url']
    c = curl.Curl(url, large=True, silent=False)

    tab = list(csv.DictReader(c.result))

    return [
        ConnectomedbInteraction(
            ligand=row['Ligand gene symbol'],
            ligand_location=resemicol.split(row['Ligand location']),
            receptor=row['Receptor gene symbol'],
            references=rea.findall(row['PMID support']),
        )
        for row in tab
    ]
def smiles2chembl(self, smiles):
    """
    Translates SMILES to ChEMBL IDs via the ChEMBL web service.
    """
    self.result = {}
    prg = progress.Progress(
        total=len(smiles),
        name='Translating SMILES',
        interval=1,
    )

    for sml in smiles:

        url = self.chembl_url.format(sml)
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[sml] = []

        if result is not None:

            try:
                data = json.loads(result)

                for d in data['compounds']:

                    this_smile = d['smiles']
                    this_chembl = d['chemblId']
                    # if this_smile == sml:
                    self.result[sml].append(this_chembl)

            except ValueError:

                # not JSON: fall back to parsing the XML response
                soup = bs4.BeautifulSoup(result, 'html.parser')
                compounds = soup.find_all('compound')

                if compounds is not None:

                    for compound in compounds:

                        this_smile = compound.find('smiles').text
                        this_chembl = compound.find('chemblid').text
                        # if this_smile == sml:
                        self.result[sml].append(this_chembl)

        prg.step()

    prg.terminate()
def get_pfam_pdb():
    """
    Mappings between Pfam domains and PDB structures.
    Returns two dictionaries, PDB-to-Pfam and Pfam-to-PDB, with chain,
    start and end positions as values.
    """
    c = curl.Curl(urls.urls['pfam_pdb']['url'], silent=False)
    data = c.result

    if data is None:
        return None, None

    pdb_pfam = {}
    pfam_pdb = {}

    data = data.replace('\r', '').split('\n')
    # header row
    del data[0]

    for l in data:

        l = l.split('\t')

        if len(l) > 4:

            pfam = l[4].split('.')[0]
            pdb = l[0].lower()
            chain = l[1]
            start = int(common.non_digit.sub('', l[2]))
            end = int(common.non_digit.sub('', l[3]))

            if pdb not in pdb_pfam:
                pdb_pfam[pdb] = {}

            if pfam not in pfam_pdb:
                pfam_pdb[pfam] = {}

            pdb_pfam[pdb][pfam] = [chain, start, end]
            pfam_pdb[pfam][pdb] = [chain, start, end]

    return pdb_pfam, pfam_pdb
def uniprot_data(field, organism=9606, reviewed=True):
    """
    Retrieves a field from UniProt for all proteins of one organism, by
    default only the reviewed (SwissProt) proteins.
    For the available fields refer to the ``_uniprot_fields`` attribute of
    this module or the UniProt website.
    """
    rev = (
        ' AND reviewed: yes'
        if reviewed == True or reviewed == 'yes' else
        ' AND reviewed: no'
        if reviewed == False or reviewed == 'no' else
        ''
    )
    _field = _uniprot_fields[field] if field in _uniprot_fields else field

    url = urls.urls['uniprot_basic']['url']
    get = {
        'query': 'organism:%s%s' % (str(organism), rev),
        'format': 'tab',
        'columns': 'id,%s' % _field,
        'compress': 'yes',
    }

    c = curl.Curl(url, get=get, silent=False, large=True, compr='gz')
    _ = next(c.result)

    return dict(
        id_value
        for id_value in (
            line.strip('\n\r').split('\t')
            for line in c.result
            if line.strip('\n\r')
        )
        if id_value[1]
    )
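# Sketch of a single-protein lookup on top of `uniprot_data`; the default
# field name 'keywords' is an assumed example, any key of `_uniprot_fields`
# or a raw UniProt column name should behave the same way.
def _uniprot_field_of(identifier, field='keywords', organism=9606):

    return uniprot_data(field, organism=organism).get(identifier)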
def _protein_datasheet(url):

    cache = True

    # retry up to 3 times: if the response is an HTML error page,
    # bypass the cache and try again
    for a in range(3):

        c = curl.Curl(
            url,
            silent=True,
            large=False,
            cache=cache,
            connect_timeout=(
                settings.get('uniprot_datasheet_connect_timeout')
            ),
            timeout=settings.get('uniprot_datasheet_timeout'),
        )

        if not c.result or c.result.startswith('<!DOCTYPE'):
            cache = False
        else:
            break

    if not c.result:
        _logger._log(
            'Could not retrieve UniProt datasheet by URL `%s`.' % url
        )

    return _redatasheet.findall(c.result) if c.result else []
def adhesome_interactions():
    """
    Interactions between adhesome components.
    """
    AdhesomeInteraction = collections.namedtuple(
        'AdhesomeInteraction',
        ['source', 'target', 'effect', 'type', 'pmid'],
    )

    url = urls.urls['adhesome']['interactions']
    c = curl.Curl(url, large = True, silent = False)

    data = csv.DictReader(c.result, delimiter = ',')

    result = []

    for rec in data:

        result.append(
            AdhesomeInteraction(
                source = rec['Source'],
                target = rec['Target'],
                effect = rec['Effect'],
                type = common.upper0(rec['Type']),
                pmid = rec['PMID'],
            )
        )

    return result
def phobius_annotations():
    """
    Transmembrane topology and signal peptide predictions from Phobius.
    """
    # repair rows broken by a spurious tab in the source file
    rewrongtab = re.compile(r'(\t[A-Z\d]+_[A-Z]+)\t([A-Z]+)\s+(\d)')

    PhobiusAnnotation = collections.namedtuple(
        'PhobiusAnnotation',
        ['tm_helices', 'signal_peptide', 'cytoplasmic', 'non_cytoplasmic'],
    )

    url = urls.urls['phobius']['url']
    c = curl.Curl(url, silent=False, large=True)
    # header row
    _ = next(c.result)

    result = collections.defaultdict(set)

    for line in c.result:

        line = rewrongtab.sub(r'\1\2\t\3', line)
        line = line.strip().split('\t')

        result[line[1]].add(
            PhobiusAnnotation(
                tm_helices=int(line[3]),
                signal_peptide=line[4] == 'Y',
                cytoplasmic=line[5].count('i'),
                non_cytoplasmic=line[5].count('o'),
            )
        )

    return dict(result)
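# Selection sketch (assumption: records with at least `min_helices` predicted
# TM helices indicate multi-pass membrane proteins; the default of 7 is only
# illustrative, e.g. GPCR-like topology).
def _phobius_multipass(min_helices=7):

    return {
        uniprot
        for uniprot, annots in phobius_annotations().items()
        if any(a.tm_helices >= min_helices for a in annots)
    }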
def ramilowski_interactions(putative = False):
    """
    Downloads and processes ligand-receptor interactions from
    Supplementary Table 2 of Ramilowski 2015.
    Returns list of lists with ligand and receptor gene symbols, reference
    and resources as elements.
    """
    c = curl.Curl(urls.urls['rami']['url'], silent = False, large = True)
    xlsname = c.fname
    del(c)
    raw = inputs_common.read_xls(xlsname, 'All.Pairs')[1:]

    return [
        [
            r[1],
            r[3],
            r[13].replace(' ', ''), # references
            ';'.join(filter(len, itertools.chain(r[5:11], [r[15]]))),
        ]
        for r in raw
        if r[15] != 'EXCLUDED not ligand' and (
            putative or r[15] != 'putative'
        )
    ]
def phosphoelm_enzyme_substrate(organism=9606, ltp_only=True):
    """
    Downloads kinase-substrate interactions from phosphoELM.
    Returns list of dicts.

    :param int organism:
        NCBI Taxonomy ID.
    :param bool ltp_only:
        Include only low-throughput interactions.
    """
    result = []
    non_digit = re.compile(r'[^\d.-]+')

    if organism is None:
        _organism = None
    elif organism in taxonomy.phosphoelm_taxids:
        _organism = taxonomy.phosphoelm_taxids[organism]
    else:
        sys.stdout.write('\t:: Unknown organism: `%u`.\n' % organism)
        return []

    url = urls.urls['p_elm']['url']
    c = curl.Curl(url, silent=False)
    data = c.result
    data = [
        n for d, n in iteritems(data)
        if d.startswith(urls.urls['p_elm']['psites'])
    ]
    data = data[0] if len(data) > 0 else ''
    data = [l.split('\t') for l in data.split('\n')]
    kinases = phosphoelm_kinases()
    del data[0]

    for l in data:

        if (
            len(l) == 9 and
            (l[7] == _organism or _organism is None) and
            (not ltp_only or l[6] == 'LTP')
        ):
            l[1] = 1 if '-' not in l[0] else int(l[0].split('-')[1])
            l[0] = l[0].split('-')[0]
            del l[-1]

            if len(l[5]) > 0 and l[5] in kinases:

                kinase = kinases[l[5]]

                result.append({
                    'instance': None,
                    'isoform': l[1],
                    'resaa': l[3],
                    'resnum': int(non_digit.sub('', l[2])),
                    'start': None,
                    'end': None,
                    'substrate': l[0],
                    'kinase': kinase,
                    'references': l[4].split(';'),
                    'experiment': l[6],
                    'organism': l[7],
                })

    return result
def _huri_interactions(dataset):
    """
    Downloads and processes interactions from HuRI and related
    CCSB datasets.
    """
    reuniprot = re.compile(r'[a-z]+:([\w\.]+)(?:-?([0-9]?))?')
    rescore = re.compile(r'author score: ([\.0-9]+)')

    HuriInteraction = collections.namedtuple(
        'HuriInteraction',
        ['uniprot_a', 'uniprot_b', 'isoform_a', 'isoform_b', 'score'],
    )

    def _map_ids(_id):

        id_type = _id[:4].lower()

        return mapping.map_name(
            _id,
            id_type if id_type in {'ensp', 'enst'} else 'uniprot',
            'uniprot',
        )

    url = dataset if dataset.startswith('http') else urls.urls['hid'][dataset]
    c = curl.Curl(url, large=True, silent=False)
    path = (
        c.fileobj.name
        if hasattr(c, 'fileobj') else
        c.cache_file_name or c.outfile
    )
    del c
    c = curl.FileOpener(path)

    for row in c.result:

        score = rescore.search(row)

        if score:
            score = float(score.groups()[0])

        row = row.split()

        if len(row) < 2:
            continue

        id_a, isoform_a = reuniprot.match(row[0]).groups()
        id_b, isoform_b = reuniprot.match(row[1]).groups()

        uniprots_a = _map_ids(id_a)
        uniprots_b = _map_ids(id_b)

        for uniprot_a, uniprot_b in itertools.product(uniprots_a, uniprots_b):

            yield HuriInteraction(
                uniprot_a=uniprot_a,
                uniprot_b=uniprot_b,
                isoform_a=int(isoform_a) if isoform_a else 1,
                isoform_b=int(isoform_b) if isoform_b else 1,
                score=score,
            )
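# A thin public wrapper is the assumed way to call the private worker above;
# the dataset key 'huri' is an assumption about `urls.urls['hid']`.
def huri_interactions():

    return _huri_interactions('huri')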