def _read_mapping_uniprot_list(self, source, target, ac_list):
    """
    Reads a mapping table from UniProt "upload lists" service.

    :param str source: Source ID type (UniProt "from" key).
    :param str target: Target ID type (UniProt "to" key).
    :param list ac_list: Identifiers to be mapped.

    Returns the raw result object from the curl wrapper (a file-like
    iterator, as `large=True` is used), or None if the download
    failed repeatedly.
    """
    url = urls.urls['uniprot_basic']['lists']
    post = {
        'from': source,
        'format': 'tab',
        'to': target,
        'uploadQuery': ' '.join(ac_list),
    }
    c = curl.Curl(url, post=post, large=True, silent=False)

    if c.result is None:
        # transient server failures are common here: retry up to 3
        # times, bypassing the cache so a cached failure is not reused
        for _ in range(3):
            c = curl.Curl(
                url, post=post, large=True, silent=False, cache=False
            )
            if c.result is not None:
                break

    if c.result is None:
        sys.stdout.write('\t:: Error at downloading from UniProt.\n')

    return c.result
def query(self, api, param, silent=False, large=False):
    '''
    Retrieves data from the API.

    @api : str
        Should be one of the 10 API sections available.
    @param : tuple
        Tuple of the parameters according to the API.
    @large : bool
        Passed to the curl wrapper function. If True, the file will
        be written to disk, and a file object open for reading is
        returned; if False, the raw data will be returned, in case
        of JSON, converted to python object, in case of XML, as a
        string.
    '''
    url = self.urls[api] % param
    # long timeout is given, because huge files (hundreds MB) take
    # time to load
    c = curl.Curl(
        url,
        req_headers=self.auth,
        silent=silent,
        timeout=1200,
        large=large,
    )
    # keep a reference to the curl object for debugging/inspection
    self.tmp = c

    if self.output_format == 'json' and not large:
        self.result = self.get_json(c.result)
    else:
        self.result = c.fileobj
def get_mirbase_aliases(organism=9606):
    """
    Downloads and processes mapping tables from miRBase.

    Returns a pair of dicts (mature, precursor), each mapping a
    miRBase accession to the set of its alias names.
    """
    if type(organism) in common.charTypes:
        mborganism = organism
    elif organism not in common.mirbase_taxids:
        raise ValueError('Organism not known: %u. Try to pass miRBase '
                         'taxon prefix as string, e.g. `hsa`.' % organism)
    else:
        mborganism = common.mirbase_taxids[organism]

    mat = {}
    mir = {}
    url = urls.urls['mirbase']['aliases']
    c = curl.Curl(url, silent=False, large=True)

    for raw in c.result:
        fields = raw.decode('utf-8').strip().strip(';').split('\t')
        # keep only records whose aliases carry the organism prefix
        if fields[1][:3] != mborganism:
            continue
        # MIMAT accessions denote mature miRNAs, the rest precursors
        table = mat if fields[0][:5] == 'MIMAT' else mir
        table.setdefault(fields[0], set()).update(fields[1].split(';'))

    return mat, mir
def get_uniprot_sec(organism=9606):
    """
    Downloads and processes the mapping between secondary and
    primary UniProt IDs. Yields pairs of secondary and primary
    UniProt IDs.

    :param int organism:
        NCBI Taxonomy ID of the organism.
    """
    if organism is not None:
        proteome = set(uniprot_input.all_uniprots(organism=organism))

    sec_pri = []
    url = urls.urls['uniprot_sec']['url']
    c = curl.Curl(url, silent=False, large=True)

    for lnum, raw in enumerate(c.result):
        # the first 30 lines are a header block in this file
        if lnum < 30:
            continue
        pair = raw.decode('utf-8').split()
        if len(pair) == 2 and (organism is None or pair[1] in proteome):
            yield pair
def translate(self, source, target, lst):
    """
    Translates compound identifiers between two ID types.

    InChIKey and SMILES inputs are dispatched to their dedicated
    handlers; any other source type is resolved through the UniChem
    REST endpoint, one compound per request. Results accumulate in
    `self.result`, keyed by the input identifier.
    """
    if source == 'inchikey':
        self.inchikey2anything(target, lst)
        return None

    if source == 'smiles':
        self.smiles2chembl(lst)
        return None

    self.result = {}
    source = str(source) if type(source) is int else self.name_dict[source]
    target = str(target) if type(target) is int else self.name_dict[target]

    prg = progress.Progress(
        total=len(lst),
        name='Translating compound identifiers',
        interval=1,
    )

    for compound in lst:
        url = '/'.join([self.url_stem, compound, source, target])
        c = curl.Curl(url, large=False)
        response = c.result
        self.result[compound] = []
        if response is not None:
            for entry in json.loads(response):
                self.result[compound].append(entry['src_compound_id'])
        prg.step()

    prg.terminate()
def smiles2chembl(self, smiles):
    """
    Translates a list of SMILES strings to ChEMBL IDs.

    Fills `self.result` with a list of ChEMBL IDs for each input
    SMILES. The service may answer with JSON or XML; if JSON parsing
    fails, the XML answer is parsed with BeautifulSoup instead.
    """
    self.result = {}
    prg = progress.Progress(
        total=len(smiles), name='Translating SMILEs', interval=1
    )

    for sml in smiles:
        url = self.chembl_url.format(sml)
        c = curl.Curl(url, large=False)
        response = c.result
        self.result[sml] = []

        if response is not None:
            try:
                data = json.loads(response)
                for entry in data['compounds']:
                    entry_smile = entry['smiles']
                    entry_chembl = entry['chemblId']
                    # an exact-SMILES filter existed here once but
                    # is disabled: every hit is collected
                    self.result[sml].append(entry_chembl)
            except ValueError:
                # not JSON: fall back to parsing the XML answer
                soup = bs4.BeautifulSoup(response)
                compounds = soup.find_all('compound')
                if compounds is not None:
                    for compound in compounds:
                        cmp_smile = compound.find('smiles').text
                        cmp_chembl = compound.find('chemblid').text
                        self.result[sml].append(cmp_chembl)

        prg.step()
    prg.terminate()
def load_uniprot_mappings(self, ac_types=None, bi=False, ncbi_tax_id=None):
    """
    Builds mapping tables between UniProt ACs and other ID types
    from the UniProt ID mapping FTP file, using a pickle cache
    where available.

    :param ac_types: Iterable of ID type names; defaults to all
        known name types.
    :param bool bi: Whether to build the reverse (UniProt -> other)
        mapping as well.
    :param int ncbi_tax_id: Organism NCBI Taxonomy ID; defaults to
        the instance's default organism.
    """
    ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
    tables = self.tables[ncbi_tax_id]
    # work on a private list: the default is a dict keys view which
    # cannot be mutated, and we remove items below as they are
    # satisfied from the cache
    ac_types = list(
        ac_types if ac_types is not None else self.name_types.keys()
    )

    # creating empty MappingTable objects:
    for ac_typ in ac_types:
        tables[(ac_typ, 'uniprot')] = MappingTable(
            ac_typ,
            'uniprot',
            'protein',
            ac_typ,
            None,
            ncbi_tax_id,
            None,
            log=self.ownlog,
        )

    # attempting to load them from Pickle; iterate over a copy,
    # since removing from the list being iterated skips elements
    for ac_typ in list(ac_types):
        md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
        cachefile = os.path.join('cache', md5ac)
        if self.cache and os.path.isfile(cachefile):
            with open(cachefile, 'rb') as fp:
                tables[(ac_typ, 'uniprot')].mapping = pickle.load(fp)
            ac_types.remove(ac_typ)
            tables[(ac_typ, 'uniprot')].mid = md5ac

    # loading the remaining from the big UniProt mapping file:
    if ac_types:
        url = urls.urls['uniprot_idmap_ftp']['url']
        c = curl.Curl(url, silent=False, large=True)
        prg = progress.Progress(
            c.size, "Processing ID conversion list", 99
        )

        for line in c.result:
            prg.step(len(line))
            line = line.decode('ascii').strip().split('\t')

            for ac_typ in ac_types:
                if len(line) > 2 and self.name_types[ac_typ] == line[1]:
                    # drop any version suffix from the foreign ID
                    other = line[2].split('.')[0]
                    mapping = tables[(ac_typ, 'uniprot')].mapping
                    # test membership of the trimmed key `other`:
                    # testing the raw `line[2]` (as before) would
                    # re-create and clobber the list whenever a
                    # version suffix is present
                    if other not in mapping['to']:
                        mapping['to'][other] = []
                    mapping['to'][other].append(line[0].split('-')[0])
                    if bi:
                        uniprot = line[0].split('-')[0]
                        if uniprot not in mapping['from']:
                            mapping['from'][uniprot] = []
                        mapping['from'][uniprot].append(other)

        prg.terminate()

        if self.cache:
            for ac_typ in ac_types:
                # use exactly the same key as the loading branch
                # above, otherwise the cache written here could
                # never be found again
                md5ac = common.md5((ac_typ, 'uniprot', bi, ncbi_tax_id))
                cachefile = os.path.join('cache', md5ac)
                with open(cachefile, 'wb') as fp:
                    pickle.dump(tables[(ac_typ, 'uniprot')].mapping, fp)
def read_mapping_uniprot(self, param, ncbi_tax_id=None):
    """
    Downloads ID mappings directly from UniProt.
    See the names of possible identifiers here:
    http://www.uniprot.org/help/programmatic_access

    :param UniprotMapping param: UniprotMapping instance
    :param int ncbi_tax_id: Organism NCBI Taxonomy ID.
    """
    ncbi_tax_id = self.get_tax_id(ncbi_tax_id)
    # splits multi-value fields on whitespace or semicolons
    resep = re.compile(r'[\s;]')
    if param.__class__.__name__ != "UniprotMapping":
        self.ownlog.msg(2, "Invalid parameter for read_mapping_uniprot()",
                        'ERROR')
        return {}
    mapping_o = {}
    mapping_i = {}
    # NOTE(review): r'$;' anchors end-of-string *before* a literal
    # semicolon, which normally never matches; presumably r';$'
    # (trailing semicolon) was intended — confirm before changing.
    scolend = re.compile(r'$;')
    # restrict to reviewed/unreviewed records when requested
    rev = '' if param.swissprot is None \
        else ' AND reviewed:%s' % param.swissprot
    query = 'organism:%u%s' % (int(ncbi_tax_id), rev)
    self.url = urls.urls['uniprot_basic']['url']
    self.post = {
        'query': query,
        'format': 'tab',
        'columns': 'id,%s%s' % (param.field, ''
                                if param.subfield is None
                                else '(%s)' % param.subfield)
    }
    # parameters are sent in the query string, not as POST body
    self.url = '%s?%s' % (self.url, urllib.urlencode(self.post))
    c = curl.Curl(self.url, silent=False)
    data = c.result
    self.data = data
    # parse the tab-separated answer into a nested list:
    # one row per record, one cell per column; protein names are
    # kept whole, every other field is split into individual IDs
    data = [[[xx] if param.field == 'protein names' else [
        xxx for xxx in resep.split(scolend.sub('', xx.strip()))
        if len(xxx) > 0
    ] for xx in x.split('\t') if len(xx.strip()) > 0]
            for x in data.split('\n') if len(x.strip()) > 0]
    if len(data) > 0:
        # first row is the column header
        del data[0]
    for l in data:
        if len(l) > 1:
            # protein names need extra post-processing
            l[1] = self.process_protein_name(l[1][0]) \
                if param.field == 'protein names' else l[1]
            for other in l[1]:
                if other not in mapping_o:
                    mapping_o[other] = []
                # forward direction: other ID -> UniProt AC
                mapping_o[other].append(l[0][0])
                if param.bi:
                    # reverse direction: UniProt AC -> other ID
                    if l[0][0] not in mapping_i:
                        mapping_i[l[0][0]] = []
                    mapping_i[l[0][0]].append(other)
    self.mapping['to'] = mapping_o
    if param.bi:
        self.mapping['from'] = mapping_i
def get_uniprot_sec(organism=9606):
    """
    Downloads the mapping between secondary and primary UniProt IDs
    and returns a lazy iterator over [secondary, primary] pairs,
    optionally restricted to the proteome of one organism.
    """
    if organism is not None:
        proteome = set(uniprot_input.all_uniprots(organism=organism))

    sec_pri = []
    url = urls.urls['uniprot_sec']['url']
    c = curl.Curl(url, silent=False, large=True)
    data = c.result

    # skip the 30-line header block, then keep well-formed pairs
    # belonging to the requested proteome
    return (
        pair
        for lnum, raw in enumerate(data)
        if lnum >= 30
        for pair in (raw.decode('utf-8').split(),)
        if len(pair) == 2 and (organism is None or pair[1] in proteome)
    )
def all_uniprots(organism=9606, swissprot=None):
    """
    Queries the list of UniProt IDs for one organism, optionally
    restricted to reviewed (SwissProt) records.
    """
    rev = '' if swissprot is None else ' AND reviewed:%s' % swissprot
    url = urls.urls['uniprot_basic']['url']
    post = {
        'query': 'organism:%s%s' % (str(organism), rev),
        'format': 'tab',
        'columns': 'id',
    }
    c = curl.Curl(url, post=post, silent=False)
    data = c.result
    # drop the header line; keep stripped, non-empty IDs
    return [line.strip() for line in data.split('\n')[1:] if line.strip()]
def _all_uniprots(organism=9606, swissprot=None):
    """
    Queries all UniProt IDs for one organism via a GET request.

    :param organism: NCBI Taxonomy ID (or organism query string).
    :param swissprot: If True or 'yes', restrict to reviewed
        (SwissProt) records; None or False means no restriction.
    """
    swissprot = 'yes' if swissprot == True else swissprot
    # no space after the colon: `reviewed: yes` is not parsed as a
    # field filter by the UniProt query syntax (sibling functions
    # consistently use 'AND reviewed:%s')
    rev = '' if not swissprot else ' AND reviewed:%s' % swissprot
    url = urls.urls['uniprot_basic']['url']
    get = {
        'query': 'organism:%s%s' % (str(organism), rev),
        'format': 'tab',
        'columns': 'id',
    }
    c = curl.Curl(url, get=get, silent=False)
    data = c.result
    # drop the header line; keep stripped, non-empty IDs
    return [l.strip() for l in data.split('\n')[1:] if l.strip()]
def get_pmid(idList):
    """
    For a list of doi or PMC IDs fetches the corresponding PMIDs.

    :param idList: A single ID or a list of IDs.

    Returns the parsed JSON response as a dict, or an empty dict
    if the download or the JSON parsing failed.
    """
    if type(idList) in common.simpleTypes:
        idList = [idList]
    url = urls.urls['pubmed-eutils']['conv'] % ','.join(
        str(i) for i in idList
    )
    c = curl.Curl(url, silent=True)
    data = c.result
    try:
        js = json.loads(data)
    except (ValueError, TypeError):
        # ValueError: malformed JSON answer;
        # TypeError: download failed and `data` is None.
        # (was a bare `except:`, which also hid unrelated errors)
        js = {}
    return js
def inchikey2anything(self, target, lst):
    """
    Translates a list of InChIKeys to the target ID type via the
    UniChem InChIKey endpoint; results go into `self.result`,
    keyed by the input InChIKey.
    """
    self.result = {}
    target = str(target) if type(target) is int else self.name_dict[target]
    prg = progress.Progress(
        total=len(lst), name='Translating InChi-Keys', interval=1
    )

    for key in lst:
        c = curl.Curl(self.inchi_stem % key, large=False)
        if c.result is not None:
            hits = json.loads(c.result)
            # keep only hits coming from the requested source
            self.result[key] = [
                hit['src_compound_id']
                for hit in hits
                if hit['src_id'] == target
            ]
        prg.step()

    prg.terminate()
def get_isoforms(organism='H**o sapiens'):
    """
    Loads the UniProt isoform sequences for one organism.

    Returns a dict of dicts: UniProt AC -> isoform number -> sequence.
    """
    # extracts the organism name from the `OS=` tag of FASTA headers
    reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')
    result = {}
    url = urls.urls['unip_iso']['url']
    c = curl.Curl(url, silent=False)
    sequences = read_fasta(c.result)

    for header, seq in iteritems(sequences):
        org = reorg.findall(header)
        if not org or org[0] != organism:
            continue
        # header shape: db|AC-isoform|entry ...
        parts = header.split('|')[1].split('-')
        result.setdefault(parts[0], {})[int(parts[1])] = seq

    return result
def setup_resource(self):
    """
    Resolves `self.settings.inFile` into `self.resource`.

    The input may be a callable, the name of a `dataio` function,
    a local path or URL, or any iterable; anything else results in
    an empty list. (If the input is a string matching none of the
    above, `self.resource` is intentionally left untouched.)
    """
    self.input = self.settings.inFile

    if callable(self.input):
        self.resource = self.input(**self.settings.inputArgs)

    elif isinstance(self.input, common.basestring):
        if hasattr(dataio, self.input):
            loader = getattr(dataio, self.input)
            self.resource = loader(**self.settings.inputArgs)
        elif os.path.exists(self.input) or curl.is_url(self.input):
            c = curl.Curl(self.input, **self.settings.curlArgs)
            self.resource = c.result

    elif hasattr(self.input, '__iter__'):
        self.resource = self.input

    else:
        self.resource = []
def connectivity_search(self,
                        id_list,
                        id_type,
                        parameters=None):
    """
    Performs a UniChem connectivity search.

    :param id_list: Identifiers to search for.
    :param id_type: 'inchikey', 'smiles', or a source ID type
        (int, or a name resolvable via `self.name_dict`).
    :param parameters: List of the search parameters A-G as
        described at
        https://www.ebi.ac.uk/unichem/info/widesearchInfo
        Defaults to [1, 0, 0, 0, 0, 1, 0].

    Results accumulate in `self.result`, keyed by identifier.
    SMILES input is not supported here: `self.result` is set to
    None and the method returns.
    """
    # avoid the shared mutable default of the original signature:
    # the default list was appended to on every call (and a
    # caller-supplied list was mutated too), so it grew each time
    parameters = (
        [1, 0, 0, 0, 0, 1, 0] if parameters is None else list(parameters)
    )
    parameters.append(1)  # H parameter must be 1 to process the result
    parameters = [str(i) for i in parameters]
    self.result = {}

    if id_type == 'inchikey':
        id_type = ''
        method = 'key_search'
    elif id_type == 'smiles':
        self.result = None
        return None
    else:
        id_type = (
            str(id_type)
            if type(id_type) is int
            else self.name_dict[id_type]
        )
        id_type = '%s/' % id_type
        method = 'cpd_search'

    prg = progress.Progress(
        total=len(id_list), name='Connectivity search', interval=1
    )

    for i in id_list:
        prg.step()
        url = self.cpd_search.format(
            method, i, id_type, '/'.join(parameters)
        )
        c = curl.Curl(url, large=False)
        result = c.result
        self.result[i] = []
        if result is not None:
            data = json.loads(result)
            for k, v in iteritems(data):
                # element 0 of each value list is skipped; the
                # remaining entries carry the matched compound IDs
                for j in range(1, len(v)):
                    self.result[i].append(v[j][0])
        # deduplicate per query compound
        self.result[i] = list(set(self.result[i]))

    prg.terminate()
def swissprot_seq(organism=9606, isoforms=False):
    """
    Loads all sequences for an organism, optionally for all
    isoforms, by default only first isoform.

    :param organism: NCBI Taxonomy ID.
    :param bool isoforms: Whether to load all isoform sequences.

    Returns a dict of UniProt AC -> Seq objects.
    """
    result = {}
    url = urls.urls['uniprot_basic']['url']
    post = {
        'query': 'organism:%s AND reviewed:yes' % str(organism),
        'format': 'tab',
        'columns': 'id,sequence',
    }
    c = curl.Curl(url, post=post, silent=False, timeout=900)
    data = c.result
    data = data.split('\n')
    del data[0]  # header line

    for line in data:
        line = line.strip().split('\t')
        if len(line) == 2:
            result[line[0]] = Seq(line[0], line[1])

    if isoforms:
        data = get_isoforms(organism=organism)
        # the loop variable used to shadow the `isoforms` argument;
        # renamed to keep the parameter readable throughout
        for unip, isoform_seqs in iteritems(data):
            for isof, seq in iteritems(isoform_seqs):
                if unip in result:
                    result[unip].add_seq(seq, isof)

    return result
def swissprot_seq(organism=9606, isoforms=False):
    """
    Loads all SwissProt sequences for an organism, optionally with
    isoforms; by default only the first isoform is included.

    :param organism: NCBI Taxonomy ID.
    :param bool isoforms: Whether to load isoform sequences too.

    Returns a dict of UniProt AC -> `se.Seq` objects.
    """
    result = {}
    url = urls.urls['uniprot_basic']['url']
    post = {
        'query': 'organism:%s AND reviewed:yes' % str(organism),
        'format': 'tab',
        'columns': 'id,sequence',
    }
    c = curl.Curl(url, post=post, silent=False)
    data = c.result
    data = data.split('\n')
    del data[0]  # header line

    for line in data:
        line = line.strip().split('\t')
        if len(line) == 2:
            result[line[0]] = se.Seq(line[0], line[1])

    if isoforms:
        # NOTE(review): isoforms are loaded with get_isoforms()'s
        # default organism regardless of the `organism` argument
        # here (an unused taxid->name dict hinted at an intended
        # translation) — confirm and pass the organism through if
        # non-human isoforms are expected.
        data = get_isoforms()
        # renamed loop variable: it used to shadow the `isoforms`
        # argument
        for unip, isoform_seqs in iteritems(data):
            for isof, seq in iteritems(isoform_seqs):
                if unip in result:
                    result[unip].add_seq(seq, isof)

    return result
def get_isoforms(organism=9606):
    """
    Loads UniProt sequences for all isoforms.

    Returns a dict of dicts: UniProt AC -> isoform number -> sequence.
    """
    if organism in common.phosphoelm_taxids:
        # translate an NCBI Taxonomy ID into the organism name
        # appearing in the FASTA headers
        organism = common.phosphoelm_taxids[organism]

    # extracts the organism name from the `OS=` tag of FASTA headers
    reorg = re.compile(r'OS=([A-Z][a-z]+\s[a-z]+)')
    result = {}
    url = urls.urls['unip_iso']['url']
    c = curl.Curl(url, silent=False)
    sequences = read_fasta(c.result)

    for header, seq in iteritems(sequences):
        org_match = reorg.findall(header)
        if not org_match or org_match[0] != organism:
            continue
        # header shape: db|AC-isoform|entry ...
        parts = header.split('|')[1].split('-')
        result.setdefault(parts[0], {})[int(parts[1])] = seq

    return result
def ptm_orthology(self):
    """
    Creates an orthology translation dict of phosphosites
    based on phosphorylation sites table from PhosphoSitePlus.
    In the result all PTMs represented by a tuple of the following
    6 elements: UniProt ID, isoform (int), residue one letter code,
    residue number (int), NCBI Taxonomy ID (int), modification type.
    """
    self.ptmhomo = {}
    # strips everything but the digits from the residue field
    nondigit = re.compile(r'[^\d]+')
    unknown_taxa = set([])

    # one PhosphoSitePlus table per modification type
    for typ in common.psite_mod_types:
        # sites sharing a PhosphoSitePlus group ID are orthologous
        groups = {}
        url = urls.urls['psite_%s' % typ[0]]['url']
        c = curl.Curl(url, silent=False, large=True)
        data = c.result
        # skip the 4 header lines of the table
        for _ in xrange(4):
            null = next(data)
        for r in data:
            r = r.decode('utf-8').split('\t')
            if len(r) < 10:
                continue
            uniprot = r[2]
            # isoform number is encoded as a `-N` suffix on the AC;
            # missing suffix means isoform 1
            isoform = 1 if '-' not in uniprot else int(
                uniprot.split('-')[1])
            uniprot = uniprot.split('-')[0]
            # residue field like `S15`: letter code + position
            aa = r[4][0]
            num = int(nondigit.sub('', r[4]))
            # organism name (column 7) translated to taxid;
            # unknown names are collected and reported at the end
            if r[6] not in common.taxa:
                unknown_taxa.add(r[6])
                continue
            tax = common.taxa[r[6]]
            group = int(r[5])
            this_site = (uniprot, isoform, aa, num, tax, typ[1])
            if group not in groups:
                groups[group] = set([])
            groups[group].add(this_site)

        # expand each orthology group into pairwise translations,
        # keyed first by source site, then by target taxon
        for group, sites in iteritems(groups):
            for site1 in sites:
                for site2 in sites:
                    # skip same-taxon pairs: only cross-species
                    # translations are of interest
                    if site1[4] == site2[4]:
                        continue
                    if site1 not in self.ptmhomo:
                        self.ptmhomo[site1] = {}
                    if site2[4] not in self.ptmhomo[site1]:
                        self.ptmhomo[site1][site2[4]] = set([])
                    self.ptmhomo[site1][site2[4]].add(site2)

    if len(unknown_taxa):
        self._log('Unknown taxa encountered: %s' % (
            ', '.join(sorted(unknown_taxa))))