def addMolweight(path, vsChembl, user, pword, host, port):
    """Append ``molweight`` and ``logP`` columns to a tab-separated table.

    The header row of the file at ``path`` must contain a ``molregno``
    column.  For every data row, ``mw_freebase`` and ``alogp`` are read from
    ``compound_properties`` and appended as two extra columns; ``None`` is
    written when the query returns no row.  The result is written to
    ``<path>_sed`` and then moved over ``path``.

    Args:
        path: path of the table to annotate in place.
        vsChembl, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Raises:
        ValueError: if the header has no ``molregno`` column.
        OSError: if the temporary file cannot replace ``path``.
    """
    import os
    import time

    import queryDevice

    def first_value(sql):
        # First column of the first result row, or None when nothing matched.
        result = queryDevice.queryDevice(sql, vsChembl, user, pword, host, port)
        try:
            return result[0][0]
        except IndexError:
            return None

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting so that 'molregno' is also
    # recognised (and its values are clean) when it is the last column.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index('molregno')

    properties = {}  # molregno -> (mw, logP); avoids re-querying repeats
    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\tmolweight\tlogP\n' % header_line)
        for line in lines[1:]:
            row = line.rstrip('\n')
            molregno = row.split('\t')[idx]
            if molregno not in properties:
                # NOTE(review): presumably throttles the database - confirm.
                time.sleep(0.03)
                mw = first_value("SELECT mw_freebase FROM compound_properties WHERE molregno = %s " % molregno)
                # The original reset ``mw`` (not ``logP``) when this second
                # lookup came back empty, wiping a valid weight and leaving
                # logP stale or undefined.
                logP = first_value("SELECT alogp FROM compound_properties WHERE molregno = %s" % molregno)
                properties[molregno] = (mw, logP)
            mw, logP = properties[molregno]
            out.write("%s\t%s\t%s\n" % (row, mw, logP))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def actSizeMap(chemblTargets, release, user, pword, host, port):
    """Fetch the distinct activity ids of each ``map_pfam`` mapping type.

    Args:
        chemblTargets: not used by this function.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Tuple ``(single, multi, conflict)`` holding the three query results.
    """
    import queryDevice

    template = "SELECT DISTINCT activity_id FROM map_pfam WHERE mapType = '%s'"
    results = {}
    # Queries are issued in the order multi, single, conflict.
    for map_type in ('multi', 'single', 'conflict'):
        results[map_type] = queryDevice.queryDevice(
            template % map_type, release, user, pword, host, port)
    return results['single'], results['multi'], results['conflict']
def groupSizeMap(chemblTargets, release, user, pword, host, port):
    """Fetch the distinct protein accessions of each ``map_pfam`` mapping type.

    Args:
        chemblTargets: not used by this function.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Tuple ``(single, multi, conflict)`` holding the three query results.
    """
    import queryDevice

    def accessions_of(map_type):
        # One query per mapping type; only the quoted type differs.
        sql = "SELECT DISTINCT protein_accession FROM map_pfam WHERE mapType = '%s'" % map_type
        return queryDevice.queryDevice(sql, release, user, pword, host, port)

    multi = accessions_of('multi')
    single = accessions_of('single')
    conflict = accessions_of('conflict')
    return single, multi, conflict
def getUniprotTargets(release, user, pword, host, port):
    """Return the accessions of all SWISS-PROT / TREMBL targets.

    The number after the first underscore in ``release`` selects the query:
    releases >= 15 read ``component_sequences`` joined to
    ``target_components``; earlier releases read ``target_dictionary``.

    Args:
        release: release name of the form ``<prefix>_<number>``.
        user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        List with the first column of every result row.
    """
    if int(release.split('_')[1]) >= 15:
        sql = (
            "SELECT cs.accession FROM component_sequences cs"
            " JOIN target_components tc ON tc.component_id = cs.component_id"
            " WHERE db_source IN('SWISS-PROT', 'TREMBL')"
        )
    else:
        sql = (
            "SELECT protein_accession FROM target_dictionary"
            " WHERE db_source IN('SWISS-PROT', 'TREMBL')"
        )
    rawtargets = queryDevice.queryDevice(sql, release, user, pword, host, port)
    return [row[0] for row in rawtargets]
def researchCode(pfamDict, release, user, pword, host, port):
    """Write research codes of mapped compounds to ``data/resCodeExamples.tab``.

    Reads ``data/map_pfam.txt`` (header row skipped).  Columns 1, 2 and 3
    (0-based) of each row are used as Pfam domain, molregno and UniProt
    accession.  One output row is written for each input row whose molregno
    has a ``RESEARCH_CODE`` synonym; rows without one are skipped.

    Args:
        pfamDict: mapping ``accession -> {'domains': [...], ...}``; only the
            length of ``'domains'`` is used.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Raises:
        KeyError: if an accession from the file is missing from ``pfamDict``.
    """
    import time

    import queryDevice

    # Context managers close both files even when a lookup fails; the
    # original never closed the input file at all.
    with open('data/map_pfam.txt', 'r') as infile:
        lines = infile.readlines()

    with open('data/resCodeExamples.tab', 'w') as out:
        out.write('research_code\tpfam\tuniprot\tnPfam\n')
        for line in lines[1:]:
            elements = line.split('\t')
            molregno = elements[2]
            pfam = elements[1]
            uniprot = elements[3]
            nPfam = len(pfamDict[uniprot]['domains'])
            # NOTE(review): presumably throttles the database - confirm.
            time.sleep(0.03)
            resCode = queryDevice.queryDevice("SELECT synonyms FROM molecule_synonyms WHERE syn_type = 'RESEARCH_CODE' AND molregno = %s" % molregno, release, user, pword, host, port)
            try:
                code = resCode[0][0]
            except IndexError:
                # No research code for this compound: nothing to report.
                continue
            out.write('%s\t%s\t%s\t%s\n' % (code, pfam, uniprot, nPfam))
def countLigs(humanTargets, chemblTargets, release, user, pword, host, port):
    """Count distinct ligands per domain and per target from ``map_pfam``.

    For every entry of ``humanTargets`` that is also in ``chemblTargets``,
    the ``(domain, molregno)`` rows of ``map_pfam`` are fetched.  Two tables
    are written: ``data/domLigs.tab`` (distinct molregnos per domain) and
    ``data/targLigs.tab`` (distinct molregnos per target).

    Args:
        humanTargets: iterable of protein accessions to consider.
        chemblTargets: container tested with ``in`` for each accession.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.
    """
    import queryDevice

    ligandPerDom = {}
    ligandPerTarget = {}
    for humGene in humanTargets:
        if humGene not in chemblTargets:
            continue
        qRes = queryDevice.queryDevice("SELECT domain, molregno FROM map_pfam WHERE protein_accession = '%s'" % humGene, release, user, pword, host, port)
        for element in qRes:
            domain = element[0]
            lig = element[1]
            # Sets keep each ligand once per domain / target.
            ligandPerDom.setdefault(domain, set()).add(lig)
            ligandPerTarget.setdefault(humGene, set()).add(lig)

    # ``with`` guarantees the files are closed even if a write fails.
    with open('data/domLigs.tab', 'w') as domOut:
        domOut.write('domain\tfreq\n')
        for domain in ligandPerDom:
            domOut.write('%s\t%s\n' % (domain, len(ligandPerDom[domain])))

    with open('data/targLigs.tab', 'w') as targetOut:
        # Header text kept exactly as before (first column is still
        # labelled 'domain'), so existing readers of the file keep working.
        targetOut.write('domain\tfreq\n')
        for target in ligandPerTarget:
            targetOut.write('%s\t%s\n' % (target, len(ligandPerTarget[target])))
def getRatio(pfamDict, humanTargets, release, user, pword, host, port):
    """Store the domain-to-sequence length ratio of every target.

    ``pfamDict[target]['ratio']`` starts as ``'NA'``.  It is replaced by the
    summed ``end - start`` of the target's domains divided by
    ``len(sequence) - 1``.  The sequence comes from ``humanTargets`` or, when
    the target is missing there, from ``target_dictionary``; targets without
    any sequence keep ``'NA'``.

    Args:
        pfamDict: ``target -> {'domains', 'start', 'end'}``; updated in place.
        humanTargets: mapping ``target -> sequence``.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        The same ``pfamDict`` object.
    """
    import numpy as np
    import queryDevice

    for target in list(pfamDict.keys()):
        entry = pfamDict[target]
        entry['ratio'] = 'NA'
        try:
            seq_len = len(humanTargets[target]) - 1
        except KeyError:
            rows = queryDevice.queryDevice("SELECT protein_sequence FROM target_dictionary WHERE protein_accession = '%s'" % target, release, user, pword, host, port)
            try:
                seq_len = len(rows[0][0]) - 1
            except IndexError:
                # No sequence anywhere: leave the 'NA' marker in place.
                continue
        n_domains = len(entry['domains'])
        dom_len = sum(entry['end'][i] - entry['start'][i] for i in range(n_domains))
        entry['ratio'] = np.true_divide(dom_len, seq_len)
    return pfamDict
def get_el_targets(params):
    """Fetch per-target domain, assay and activity counts.

    Considers direct ('D') binding ('B') assays on SINGLE PROTEIN and
    PROTEIN COMPLEX targets with an exact ('=') Ki/Kd/IC50/EC50/AC50 value
    in nM no greater than ``int(params['threshold']) * 1000``.  Activities
    reported on a log scale (e.g. pIC50) are not covered.  Written for the
    schema of chembl_15 and later.

    Inputs:
    params -- dictionary with a 'threshold' entry; also handed to
              queryDevice.queryDevice as its second argument

    Outputs a list of tuples
    [(tid, target_type, domain_count, assay_count, act_count),...]
    """
    sql = (
        " SELECT DISTINCT dc.tid, dc.target_type, dc.dc,"
        " COUNT(DISTINCT act.assay_id), COUNT(DISTINCT activity_id)"
        " FROM assays ass"
        " JOIN( SELECT td.tid, td.target_type, COUNT(cd.domain_id) as dc"
        " FROM target_dictionary td"
        " JOIN target_components tc ON tc.tid = td.tid"
        " JOIN component_sequences cs ON cs.component_id = tc.component_id"
        " JOIN component_domains cd ON cd.component_id = cs.component_id"
        " WHERE td.target_type IN('SINGLE PROTEIN', 'PROTEIN COMPLEX')"
        " GROUP BY td.tid ) as dc"
        " ON dc.tid = ass.tid"
        " JOIN activities act ON act.assay_id = ass.assay_id"
        " WHERE act.standard_type IN('Ki','Kd','IC50','EC50', 'AC50')"
        " AND ass.relationship_type = 'D'"
        " AND assay_type IN('B')"
        " AND act.standard_relation IN('=')"
        " AND standard_units = 'nM'"
        " AND standard_value <= %s"
        " GROUP BY dc.tid ORDER BY COUNT(activity_id)"
    )
    max_value = int(params['threshold']) * 1000
    data = queryDevice.queryDevice(sql % max_value, params)
    # Single pre-formatted argument: prints the same text as the original
    # comma-separated print statement.
    print("retrieved data for  %s tids." % len(data))
    return data
def interAssay():
    """Build and annotate the inter-assay comparison table of each species.

    Reads ``gla.yaml`` from the working directory.  For every entry of its
    ``species`` list, the query from ``queries.activities`` is run, the
    results go through ``mkDict.activities`` and
    ``writePairs.interAssaySampled``, and the resulting table
    ``data/interAssay_<species>_<release>.tab`` is annotated with molecular
    weight, L1 target class and the ``addSeq100`` annotation.
    """
    import queryDevice
    import mkDict
    import queries
    import writePairs
    import addProperties
    import readIds  # not referenced below; import kept from the original
    import yaml

    # Read config file; ``with`` closes it, which the original never did.
    with open('gla.yaml') as paramFile:
        params = yaml.safe_load(paramFile)

    # Get information for all relevant activities from ChEMBL.
    for spec in params['species']:
        # str.replace is what string.replace delegated to, and it also
        # exists on Python 3.
        specName = spec.replace(' ', '_')
        dictFile = "data/inter_compDict_%s_%s.pkl" % (specName, params['release'])
        results = "data/interAssay_%s_%s.tab" % (specName, params['release'])
        query = queries.activities(spec)
        acts = queryDevice.queryDevice(query, params)
        mkDict.activities(acts, dictFile)
        writePairs.interAssaySampled(results, dictFile)
        addProperties.addMolweight("molregno", results, params)
        addProperties.addTargetClass("L1", "accession", results, params)
        addProperties.addSeq100(results)
def getRatio(pfamDict, humanTargets, release, user, pword, host, port):
    """Annotate each target with the fraction of its sequence inside domains.

    Every ``pfamDict[target]`` gets a ``'ratio'`` entry: the sum of
    ``end - start`` over its domains divided by ``len(sequence) - 1``.  The
    sequence is taken from ``humanTargets`` and otherwise queried from
    ``target_dictionary``.  When neither has it, ``'ratio'`` stays ``'NA'``.

    Args:
        pfamDict: ``target -> {'domains', 'start', 'end'}``; modified in
            place.
        humanTargets: mapping ``target -> sequence``.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        ``pfamDict`` itself.
    """
    import numpy as np
    import queryDevice

    sequence_sql = "SELECT protein_sequence FROM target_dictionary WHERE protein_accession = '%s'"
    for target in list(pfamDict.keys()):
        info = pfamDict[target]
        info['ratio'] = 'NA'
        try:
            sequence = humanTargets[target]
            seq_len = len(sequence) - 1
        except KeyError:
            fetched = queryDevice.queryDevice(
                sequence_sql % target, release, user, pword, host, port)
            try:
                seq_len = len(fetched[0][0]) - 1
            except IndexError:
                continue  # unknown sequence: keep 'NA'
        covered = 0
        for pos in range(len(info['domains'])):
            covered += info['end'][pos] - info['start'][pos]
        info['ratio'] = np.true_divide(covered, seq_len)
    return pfamDict
def get_el_targets(RELEASE, USER, PWORD, HOST, PORT):
    """Fetch multi-domain targets with their domain and activity counts.

    Counts distinct activities per protein accession for direct ('D')
    binding ('B') assays that are neither multi nor complex, with an exact
    Ki/Kd/IC50/EC50/AC50 value below 50000, restricted to accessions with
    more than one ``pfam_domains`` row.  Uses the ``assay2target`` /
    ``pfam_domains`` tables.

    Args:
        RELEASE, USER, PWORD, HOST, PORT: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Query result rows ``(protein_accession, domain_count, act_count)``
        ordered by activity count.
    """
    sql = (
        " SELECT DISTINCT td.protein_accession, dc.dc, COUNT(DISTINCT activity_id)"
        " FROM target_dictionary td"
        " JOIN( SELECT pd.protein_accession, COUNT(protein_accession) as dc"
        " FROM pfam_domains pd"
        " GROUP BY pd.protein_accession ) as dc"
        " ON dc.protein_accession = td.protein_accession"
        " JOIN assay2target a2t ON td.tid = a2t.tid"
        " JOIN assays a ON a.assay_id = a2t.assay_id"
        " JOIN activities act ON act.assay_id = a.assay_id"
        " WHERE act.standard_type IN('Ki','Kd','IC50','EC50', 'AC50')"
        " AND a2t.multi=0"
        " AND a2t.complex =0"
        " AND a2t.relationship_type = 'D'"
        " AND act.relation = '='"
        " AND assay_type = 'B'"
        " AND standard_value < 50000"
        " AND dc.dc > 1"
        " GROUP BY td.protein_accession ORDER BY COUNT(DISTINCT activity_id)"
    )
    return queryDevice.queryDevice(sql, RELEASE, USER, PWORD, HOST, PORT)
def addTargetClass(level, key, path, params):
    """Append a ``target_class_<level>`` column to a tab-separated table.

    All distinct values of the ``key`` column are looked up in one query
    against ``target_class`` joined to ``target_dictionary``.  The table is
    rewritten through ``<path>_sed`` and moved over ``path``.  Accessions
    the query does not return get the placeholder ``0``.

    Args:
        level: column of ``target_class`` to select (interpolated into SQL).
        key: header name of the column holding protein accessions.
        path: path of the table to annotate in place.
        params: passed unchanged to ``queryDevice.queryDevice``.

    Raises:
        ValueError: if ``key`` is not a column of the header.
        OSError: if the temporary file cannot replace ``path``.
    """
    import os

    import queryDevice

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting: otherwise a key column in
    # last position is never matched and its values carry a trailing
    # newline into the SQL and the lookup.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]

    # 0 is the placeholder for accessions without a query hit.
    accessions = dict((row.split('\t')[idx], 0) for row in rows)
    accStr = "','".join(map(str, accessions.keys()))
    data = queryDevice.queryDevice("SELECT protein_accession, %s FROM target_class tc JOIN target_dictionary td ON td.tid = tc.tid WHERE protein_accession IN('%s') " % (level, accStr), params)
    for tup in data:
        accessions[tup[0]] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\ttarget_class_%s\n' % (header_line, level))
        for row in rows:
            out.write("%s\t%s\n" % (row, accessions[row.split('\t')[idx]]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def getIntacts(uniDict, release, user, pword, host, port):
    """Map protein accessions to the given compounds tested against them.

    Queries every (chembl_id, protein_accession) pair that has an activity,
    for compounds whose ``chembl_id`` is a key of ``uniDict`` and whose
    ``mw_freebase`` is at most 1000.

    Args:
        uniDict: mapping whose keys are the chembl_ids to look up.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Nested dict ``{protein_accession: {chembl_id: {}}}``.
    """
    import queryDevice

    idString = "','".join(uniDict.keys())
    sql = (
        "SELECT md.chembl_id, td.protein_accession"
        " FROM activities act"
        " JOIN assay2target a2t ON act.assay_id = a2t.assay_id"
        " JOIN target_dictionary td ON a2t.tid = td.tid"
        " JOIN molecule_dictionary md ON act.molregno = md.molregno"
        " JOIN compound_properties cp ON md.molregno = cp.molregno"
        " WHERE td.protein_accession IS NOT NULL AND cp.mw_freebase <= 1000"
        " AND md.chembl_id IN('%s')"
    )
    intactPairs = queryDevice.queryDevice(sql % idString, release, user, pword, host, port)

    intactDict = {}
    for pair in intactPairs:
        chembl_id, target = pair[0], pair[1]
        intactDict.setdefault(target, {})[chembl_id] = {}
    return intactDict
def mkIntactDictAllPDBs(release, user, pword, host, port):
    """Collect target/compound pairs for compound records with ``src_id = 6``.

    Args:
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Tuple ``(intactDict, molDict)`` where ``intactDict`` is
        ``{protein_accession: {molregno: {}}}`` and ``molDict`` maps each
        molregno to its ``compound_key`` (last row wins).
    """
    import queryDevice

    sql = (
        "SELECT cr.molregno, compound_key,protein_accession"
        " FROM compound_records cr"
        " JOIN activities act ON cr.molregno = act.molregno"
        " JOIN assay2target a2t ON act.assay_id = a2t.assay_id"
        " JOIN target_dictionary td ON a2t.tid = td.tid"
        " WHERE src_id = 6 AND protein_accession IS NOT NULL"
    )
    intactPairs = queryDevice.queryDevice(sql, release, user, pword, host, port)

    intactDict = {}
    molDict = {}
    for pair in intactPairs:
        molregno, cmpdId, target = pair[0], pair[1], pair[2]
        molDict[molregno] = cmpdId
        intactDict.setdefault(target, {})[molregno] = {}
    return (intactDict, molDict)
def groupSizeMap(chemblTargets, release, user, pword, host, port):
    """Return the protein accessions of ``map_pfam`` grouped by mapping type.

    Args:
        chemblTargets: not used by this function.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Tuple ``(single, multi, conflict)`` holding the three query results.
    """
    import queryDevice

    template = "SELECT DISTINCT protein_accession FROM map_pfam WHERE mapType = '%s'"
    found = {}
    # Queries are issued in the order multi, single, conflict.
    for map_type in ('multi', 'single', 'conflict'):
        found[map_type] = queryDevice.queryDevice(
            template % map_type, release, user, pword, host, port)
    return found['single'], found['multi'], found['conflict']
def researchCode(pfamDict, release, user, pword, host, port):
    """Write research codes of mapped compounds to ``data/resCodeExamples.tab``.

    Reads ``data/map_pfam.txt`` (header row skipped).  Columns 1, 2 and 3
    (0-based) of each row are used as Pfam domain, molregno and UniProt
    accession.  A row ``research_code, pfam, uniprot, nPfam`` is written for
    each input row whose molregno has a ``RESEARCH_CODE`` synonym; rows
    without one are skipped.

    Args:
        pfamDict: mapping ``accession -> {'domains': [...], ...}``; only the
            length of ``'domains'`` is used.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Raises:
        KeyError: if an accession from the file is missing from ``pfamDict``.
    """
    import time

    import queryDevice

    synonym_sql = "SELECT synonyms FROM molecule_synonyms WHERE syn_type = 'RESEARCH_CODE' AND molregno = %s"

    # The original left the input file open; both files are now closed even
    # when a lookup raises part-way through.
    with open('data/map_pfam.txt', 'r') as infile:
        lines = infile.readlines()

    with open('data/resCodeExamples.tab', 'w') as out:
        out.write('research_code\tpfam\tuniprot\tnPfam\n')
        for line in lines[1:]:
            elements = line.split('\t')
            pfam, molregno, uniprot = elements[1], elements[2], elements[3]
            nPfam = len(pfamDict[uniprot]['domains'])
            # NOTE(review): presumably throttles the database - confirm.
            time.sleep(0.03)
            resCode = queryDevice.queryDevice(
                synonym_sql % molregno, release, user, pword, host, port)
            try:
                code = resCode[0][0]
            except IndexError:
                continue  # compound has no research code
            out.write('%s\t%s\t%s\t%s\n' % (code, pfam, uniprot, nPfam))
def getLigandsForTarget(target, release, user, pword, host, port):
    """Fetch binding activities measured directly against one target.

    Uses the ``assay2target`` mapping.  Only binding assays ('B') with
    relation '=', a direct ('D') relationship and neither multi nor complex
    targets are returned, for the standard types listed in the query.

    Args:
        target: protein accession, interpolated into the query.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Result rows ``(molregno, standard_value, standard_type,
        standard_units, canonical_smiles, relation, activity_id)``.
    """
    import queryDevice

    sql = (
        "SELECT DISTINCT act.molregno, standard_value,"
        " standard_type, standard_units, canonical_smiles, act.relation, act.activity_id"
        " FROM activities act"
        " JOIN assay2target a2t"
        " ON act.assay_id = a2t.assay_id"
        " JOIN target_dictionary td"
        " ON a2t.tid = td.tid"
        " JOIN assays ass"
        " ON ass.assay_id = act.assay_id"
        " JOIN compound_structures cs"
        " ON cs.molregno=act.molregno"
        " WHERE td.protein_accession = '%s'"
        " AND ass.assay_type='B'"
        " AND act.relation ='='"
        " AND a2t.multi=0"
        " AND a2t.complex=0"
        " AND a2t.relationship_type = 'D'"
        " AND act.standard_type IN('Ki','Kd','IC50',"
        " 'EC50','-Log Ki','pKd' , 'pA2', 'pI', 'pKa')"
    )
    return queryDevice.queryDevice(sql % target, release, user, pword, host, port)
def addTargetClass(level, key, path, vsChembl, user, pword, host, port):
    """Append a ``targetClass_<level>`` column to a tab-separated table.

    The value of the ``key`` column of each row is looked up in
    ``target_class`` joined to ``target_dictionary``; ``None`` is written
    when nothing is found.  Each distinct accession is queried once.  The
    table is rewritten through ``<path>_sed`` and moved over ``path``.

    Args:
        level: column of ``target_class`` to select (interpolated into SQL).
        key: header name of the column holding protein accessions.
        path: path of the table to annotate in place.
        vsChembl, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Raises:
        ValueError: if ``key`` is not a column of the header.
        OSError: if the temporary file cannot replace ``path``.
    """
    import os
    import time

    import queryDevice

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting: otherwise a key column in
    # last position is never matched and its values carry a trailing newline.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)

    class_by_accession = {}  # avoids re-querying repeated accessions
    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\ttargetClass_%s\n' % (header_line, level))
        for line in lines[1:]:
            row = line.rstrip('\n')
            uniprot = row.split('\t')[idx]
            if uniprot not in class_by_accession:
                # NOTE(review): presumably throttles the database - confirm.
                time.sleep(0.03)
                result = queryDevice.queryDevice("SELECT %s FROM target_class tc JOIN target_dictionary td ON td.tid = tc.tid WHERE protein_accession = '%s' " % (level, uniprot), vsChembl, user, pword, host, port)
                try:
                    class_by_accession[uniprot] = result[0][0]
                except IndexError:
                    class_by_accession[uniprot] = None
            out.write("%s\t%s\n" % (row, class_by_accession[uniprot]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def get_doms(tids, params):
    """Get the Pfam-A domain names for a list of tids.

    Inputs:
    tids -- iterable of target ids; each is converted with str() and quoted
            inside the IN(...) clause
    params -- handed to queryDevice.queryDevice as its second argument

    Returns a dictionary {tid: [domain_name, ...]} containing only the tids
    the query returned rows for.
    """
    tidstr = "', '".join(str(t) for t in tids)
    sql = (
        " SELECT tid, domain_name"
        " FROM target_components tc"
        " JOIN component_domains cd ON cd.component_id = tc.component_id"
        " JOIN domains d ON d.domain_id = cd.domain_id"
        " WHERE tc.tid IN('%s') and domain_type = 'Pfam-A'"
    )
    pfam_lkp = {}
    for ent in queryDevice.queryDevice(sql % tidstr, params):
        pfam_lkp.setdefault(ent[0], []).append(ent[1])
    return pfam_lkp
def get_el_targets(params):
    """Query the database for the targets whose activities enter the mapping.

    Only direct ('D') binding ('B') assays on SINGLE PROTEIN or PROTEIN
    COMPLEX targets are counted, with an exact ('=') Ki, Kd, IC50, EC50 or
    AC50 value in nM of at most ``int(params['threshold']) * 1000``.
    Activities expressed on a log scale (e.g. pIC50) are not covered.  This
    function works with chembl_15 upwards.

    Inputs:
    params -- dictionary with a 'threshold' entry; also handed to
              queryDevice.queryDevice as its second argument

    Outputs a list of tuples
    [(tid, target_type, domain_count, assay_count, act_count),...]
    """
    domain_counts = (
        " SELECT td.tid, td.target_type, COUNT(cd.domain_id) as dc"
        " FROM target_dictionary td"
        " JOIN target_components tc ON tc.tid = td.tid"
        " JOIN component_sequences cs ON cs.component_id = tc.component_id"
        " JOIN component_domains cd ON cd.component_id = cs.component_id"
        " WHERE td.target_type IN('SINGLE PROTEIN', 'PROTEIN COMPLEX')"
        " GROUP BY td.tid "
    )
    template = (
        " SELECT DISTINCT dc.tid, dc.target_type, dc.dc,"
        " COUNT(DISTINCT act.assay_id), COUNT(DISTINCT activity_id)"
        " FROM assays ass"
        " JOIN(" + domain_counts + ") as dc"
        " ON dc.tid = ass.tid"
        " JOIN activities act ON act.assay_id = ass.assay_id"
        " WHERE act.standard_type IN('Ki','Kd','IC50','EC50', 'AC50')"
        " AND ass.relationship_type = 'D'"
        " AND assay_type IN('B')"
        " AND act.standard_relation IN('=')"
        " AND standard_units = 'nM'"
        " AND standard_value <= %s"
        " GROUP BY dc.tid ORDER BY COUNT(activity_id)"
    )
    query = template % (int(params['threshold']) * 1000)
    data = queryDevice.queryDevice(query, params)
    # Single pre-formatted argument: prints the same text as the original
    # comma-separated print statement.
    print("retrieved data for  %s tids." % len(data))
    return data
def actSizeMap(chemblTargets, release, user, pword, host, port):
    """Return the activity ids of ``map_pfam`` grouped by mapping type.

    Args:
        chemblTargets: not used by this function.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Tuple ``(single, multi, conflict)`` holding the three query results.
    """
    import queryDevice

    def activities_of(map_type):
        # One query per mapping type; only the quoted type differs.
        sql = "SELECT DISTINCT activity_id FROM map_pfam WHERE mapType = '%s'" % map_type
        return queryDevice.queryDevice(sql, release, user, pword, host, port)

    multi = activities_of('multi')
    single = activities_of('single')
    conflict = activities_of('conflict')
    return single, multi, conflict
def addChembl_id(key, path, params):
    """Append a ``chembl_id`` column to a tab-separated table.

    The integer values of the ``key`` column are looked up in
    ``molecule_dictionary`` with a single query.  The table is rewritten
    through ``<path>_sed`` and moved over ``path``.  Molregnos the query
    does not return get the placeholder ``0``.

    Args:
        key: header name of the column holding molregnos.
        path: path of the table to annotate in place.
        params: passed unchanged to ``queryDevice.queryDevice``.

    Raises:
        ValueError: if ``key`` is not a column of the header, or a value in
            that column is not an integer.
        OSError: if the temporary file cannot replace ``path``.
    """
    import os

    import queryDevice

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting so the key column is also
    # found when it is the last column of the header.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]
    row_molregnos = [int(row.split('\t')[idx]) for row in rows]

    # 0 is the placeholder for molregnos without a query hit.
    molregnos = dict((molregno, 0) for molregno in row_molregnos)
    molstr = "','".join(map(str, molregnos.keys()))
    print("Looking up chembl_id for  %s cmpds." % len(molregnos))
    data = queryDevice.queryDevice("SELECT distinct molregno, chembl_id FROM molecule_dictionary WHERE molregno IN('%s')" % molstr, params)
    for tup in data:
        molregnos[int(tup[0])] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\tchembl_id\n' % header_line)
        for row, molregno in zip(rows, row_molregnos):
            out.write("%s\t%s\n" % (row, molregnos[molregno]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def addMolweight(key, path, params):
    """Append a ``molweight`` column to a tab-separated table.

    The integer values of the ``key`` column are looked up in
    ``compound_properties`` (``mw_freebase``) with a single query.  The
    table is rewritten through ``<path>_sed`` and moved over ``path``.
    Molregnos the query does not return get the placeholder ``0``.

    Args:
        key: header name of the column holding molregnos.
        path: path of the table to annotate in place.
        params: passed unchanged to ``queryDevice.queryDevice``.

    Raises:
        ValueError: if ``key`` is not a column of the header, or a value in
            that column is not an integer.
        OSError: if the temporary file cannot replace ``path``.
    """
    import os

    import queryDevice

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting so the key column is also
    # found when it is the last column of the header.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]
    row_molregnos = [int(row.split('\t')[idx]) for row in rows]

    # 0 is the placeholder for molregnos without a query hit.
    molregnos = dict((molregno, 0) for molregno in row_molregnos)
    molstr = "','".join(map(str, molregnos.keys()))
    print("Looking up mw_freebase for  %s cmpds." % len(molregnos))
    data = queryDevice.queryDevice("SELECT distinct molregno, mw_freebase FROM compound_properties WHERE molregno IN('%s')" % molstr, params)
    for tup in data:
        molregnos[int(tup[0])] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\tmolweight\n' % header_line)
        for row, molregno in zip(rows, row_molregnos):
            out.write("%s\t%s\n" % (row, molregnos[molregno]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def addPrefName(key, path, params):
    """Append a quoted ``prefName_<key>`` column to a tab-separated table.

    The distinct values of the ``key`` column are looked up in
    ``target_dictionary`` (``pref_name``) with a single query.  The table is
    rewritten through ``<path>_sed`` and moved over ``path``.  Accessions
    the query does not return get the placeholder ``0``; every value is
    written inside double quotes.

    Args:
        key: header name of the column holding protein accessions.
        path: path of the table to annotate in place.
        params: passed unchanged to ``queryDevice.queryDevice``.

    Raises:
        ValueError: if ``key`` is not a column of the header.
        OSError: if the temporary file cannot replace ``path``.
    """
    import os

    import queryDevice

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting: otherwise a key column in
    # last position is never matched and its values carry a trailing newline.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]

    # 0 is the placeholder for accessions without a query hit.
    accessions = dict((row.split('\t')[idx], 0) for row in rows)
    accStr = "','".join(map(str, accessions.keys()))
    data = queryDevice.queryDevice("SELECT protein_accession, pref_name FROM target_dictionary WHERE protein_accession IN('%s') " % accStr, params)
    for tup in data:
        accessions[tup[0]] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\tprefName_%s\n' % (header_line, key))
        for row in rows:
            out.write("%s\t\"%s\"\n" % (row, accessions[row.split('\t')[idx]]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def getLigandsForTarget(target, release, user, pword, host, port):
    """Fetch binding activities of approved parent compounds for one target.

    Uses the ``target_components`` / ``component_sequences`` tables.  Only
    direct ('D') binding ('B') assays are returned, for compounds with
    ``first_approval`` set and ``active_molregno = parent_molregno``, and
    for the standard types listed in the query.

    Args:
        target: accession matched against ``component_sequences.accession``.
        release: release identifier; the database name passed on is
            ``'ChEMBL_<release>'``.
        user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Result rows ``(molregno, standard_value, standard_type,
        standard_units, canonical_smiles, standard_relation, activity_id)``.
    """
    import queryDevice

    sql = (
        "SELECT DISTINCT act.molregno, standard_value,"
        " standard_type, standard_units, canonical_smiles, act.standard_relation, act.activity_id"
        " FROM activities act"
        " JOIN assays ass"
        " ON ass.assay_id = act.assay_id"
        " JOIN compound_records cr"
        " ON act.molregno = cr.molregno"
        " JOIN molecule_dictionary md"
        " ON md.molregno = cr.molregno"
        " JOIN molecule_hierarchy mh"
        " ON cr.molregno = mh.molregno"
        " JOIN target_dictionary td"
        " ON ass.tid = td.tid"
        " JOIN target_components tc"
        " ON td.tid = tc.tid"
        " JOIN component_sequences cos"
        " ON tc.component_id = cos.component_id"
        " JOIN compound_structures cs"
        " ON cs.molregno=act.molregno"
        " WHERE cos.accession = '%s'"
        " AND ass.assay_type='B'"
        " AND ass.relationship_type = 'D'"
        " AND mh.active_molregno = mh.parent_molregno"
        " AND md.first_approval is not NULL"
        " AND act.standard_type IN('Ki','Kd','IC50',"
        " 'EC50','-Log Ki','pKd' , 'pA2', 'pI', 'pKa')"
    )
    database = 'ChEMBL_%s' % release
    return queryDevice.queryDevice(sql % target, database, user, pword, host, port)
def toSam(conflicts, threshold, user, pword, host, release, port):
    """Write one ``data/forSam_<conflict>.pred`` table per conflict string.

    For every target listed under a conflict string, the ligands are
    fetched with ``getLigands.getLigandsForTarget`` and filtered with
    ``filterForTarget.filterForTarget``.  For each remaining ligand the
    pubmed id of its activity is looked up.  Each output row holds molregno,
    pubmed id, the literal prediction ``None`` and the activity id.

    Two defects of the earlier version are fixed:

    * the pubmed list of a compound was reset on every repeated hit, so all
      of its activities were written with the most recent pubmed id; each
      activity now keeps its own;
    * the output loop iterated the same dictionary twice (nested), writing
      every row once per compound in the group; each row is now written once.

    Args:
        conflicts: mapping ``conflict string -> iterable of targets``.
        threshold: passed to ``filterForTarget.filterForTarget``.
        user, pword, host, release, port: connection details (note the
            position of ``release`` in this signature).

    Raises:
        IndexError: if an activity has no matching ``docs`` row.
    """
    import parse  # not referenced below; import kept from the original
    import getLigands
    import filterForTarget
    import queryDevice

    conf = {}  # conflict string -> molregno -> parallel 'actId'/'pubmed' lists
    for confStr in conflicts:
        for target in conflicts[confStr]:
            ligands = getLigands.getLigandsForTarget(target, release, user, pword, host, port)
            ligands = filterForTarget.filterForTarget(ligands, threshold)
            for ligand in ligands:
                # NOTE(review): positions 2 and 3 are read as molregno and
                # activity id - confirm against filterForTarget's output.
                molregno = ligand[2]
                actId = ligand[3]
                pubmed = queryDevice.queryDevice("SELECT pubmed_id FROM docs JOIN activities act ON act.doc_id = docs.doc_id WHERE activity_id = %s" % actId, release, user, pword, host, port)[0][0]
                entry = conf.setdefault(confStr, {}).setdefault(
                    molregno, {'actId': [], 'pubmed': []})
                entry['actId'].append(actId)
                entry['pubmed'].append(pubmed)

    domain = 'None'
    for confStr in conf:
        # ``with`` closes the file even if a write fails.
        with open('data/forSam_%s.pred' % confStr, 'w') as out:
            out.write('molregno\tpubmed\tprediction\tactivity_id\n')
            for molregno in conf[confStr]:
                entry = conf[confStr][molregno]
                for act, pubmed in zip(entry['actId'], entry['pubmed']):
                    out.write('%s\t%s\t%s\t%s\n' % (molregno, pubmed, domain, act))
def retrieve_acts(params):
    """Fetch chembl_id, canonical_smiles and molformula of every structure.

    Inputs:
    params -- dictionary with the connection details 'release', 'user',
              'pword', 'host' and 'port'

    Returns the rows of molecule_dictionary joined to compound_structures.
    """
    sql = (
        "SELECT md.chembl_id, cs.canonical_smiles, cs.molformula"
        " from molecule_dictionary md"
        " JOIN compound_structures cs ON md.molregno = cs.molregno"
    )
    connection = [params[name] for name in ('release', 'user', 'pword', 'host', 'port')]
    return queryDevice.queryDevice(sql, *connection)
def getTargets(release, user, pword, host, port):
    """Return the distinct human accessions of ``component_sequences``.

    Args:
        release: release number (anything ``int()`` accepts); the database
            name passed on is ``'ChEMBL_<release>'``.
        user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        List with the first column of every result row.

    Raises:
        ValueError: if ``release`` is below 15.  No query is defined for
            those releases; the original failed there with an unbound
            variable instead.
    """
    release_number = int(release)
    if release_number < 15:
        raise ValueError(
            "getTargets has no query for releases below 15 (got %r)" % (release,))

    # The organism literal had been mangled to 'H**o sapiens' in this copy,
    # which cannot match any organism; restored to the species name.
    rawtargets = queryDevice.queryDevice(
        "SELECT DISTINCT accession FROM component_sequences"
        " WHERE ORGANISM = 'Homo sapiens'",
        'ChEMBL_%s' % release, user, pword, host, port)
    return [target[0] for target in rawtargets]
def retrieve_acts(params):
    """Query chembl_id, canonical_smiles and molformula for all compounds.

    Inputs:
    params -- dictionary holding the connection details under the keys
              'release', 'user', 'pword', 'host' and 'port'

    Returns the query result (molecule_dictionary joined to
    compound_structures on molregno).
    """
    query = "SELECT md.chembl_id, cs.canonical_smiles, cs.molformula from molecule_dictionary md JOIN compound_structures cs ON md.molregno = cs.molregno"
    release = params['release']
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    return queryDevice.queryDevice(query, release, user, pword, host, port)
def getUniprotTargets(release, user, pword, host, port):
    """Return the accessions of all SWISS-PROT / TREMBL target components.

    The query also selects ``component_id`` and ``tid``, but only the
    accession (first column) of each row is returned.

    Args:
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        List of accessions, one per result row.
    """
    import queryDevice

    sql = (
        "SELECT cs.accession, cs.component_id, tid"
        " FROM component_sequences cs"
        " JOIN target_components tc ON tc.component_id = cs.component_id"
        " WHERE db_source IN('SWISS-PROT', 'TREMBL')"
    )
    rawtargets = queryDevice.queryDevice(sql, release, user, pword, host, port)
    return [row[0] for row in rawtargets]
def toSam(conflicts, threshold, user, pword, host, release, port):
    """Write one ``data/forSam_<conflict>.pred`` table per conflict string.

    For every target listed under a conflict string, the ligands are
    fetched with ``getLigands.getLigandsForTarget`` and filtered with
    ``filterForTarget.filterForTarget``.  For each remaining ligand the
    pubmed id of its activity is looked up.  Each output row holds molregno,
    pubmed id, the literal prediction ``None`` and the activity id.

    Two defects of the earlier version are fixed:

    * the pubmed list of a compound was reset on every repeated hit, so all
      of its activities were written with the most recent pubmed id; each
      activity now keeps its own;
    * the output loop iterated the same dictionary twice (nested), writing
      every row once per compound in the group; each row is now written once.

    Args:
        conflicts: mapping ``conflict string -> iterable of targets``.
        threshold: passed to ``filterForTarget.filterForTarget``.
        user, pword, host, release, port: connection details (note the
            position of ``release`` in this signature).

    Raises:
        IndexError: if an activity has no matching ``docs`` row.
    """
    import parse  # not referenced below; import kept from the original
    import getLigands
    import filterForTarget
    import queryDevice

    pubmed_sql = "SELECT pubmed_id FROM docs JOIN activities act ON act.doc_id = docs.doc_id WHERE activity_id = %s"

    # conflict string -> molregno -> list of (activity id, pubmed id)
    conf = {}
    for confStr in conflicts:
        for target in conflicts[confStr]:
            ligands = getLigands.getLigandsForTarget(target, release, user, pword, host, port)
            ligands = filterForTarget.filterForTarget(ligands, threshold)
            for ligand in ligands:
                # NOTE(review): positions 2 and 3 are read as molregno and
                # activity id - confirm against filterForTarget's output.
                molregno = ligand[2]
                actId = ligand[3]
                pubmed = queryDevice.queryDevice(
                    pubmed_sql % actId, release, user, pword, host, port)[0][0]
                conf.setdefault(confStr, {}).setdefault(molregno, []).append((actId, pubmed))

    for confStr in conf:
        # ``with`` closes the file even if a write fails.
        with open('data/forSam_%s.pred' % confStr, 'w') as out:
            out.write('molregno\tpubmed\tprediction\tactivity_id\n')
            for molregno in conf[confStr]:
                for actId, pubmed in conf[confStr][molregno]:
                    out.write('%s\t%s\t%s\t%s\n' % (molregno, pubmed, 'None', actId))
def getUniprotTargets(release, user, pword, host, port):
    """List the accessions of target components from SWISS-PROT or TREMBL.

    Only the first column (accession) of each result row is kept; the
    ``component_id`` and ``tid`` columns the query also selects are ignored.

    Args:
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        List of accessions, one per result row.
    """
    import queryDevice

    rawtargets = queryDevice.queryDevice(
        "SELECT cs.accession, cs.component_id, tid"
        " FROM component_sequences cs"
        " JOIN target_components tc ON tc.component_id = cs.component_id"
        " WHERE db_source IN('SWISS-PROT', 'TREMBL')",
        release, user, pword, host, port)
    targets = []
    targets.extend(target[0] for target in rawtargets)
    return targets
def getLigandsForActivity(activity, release, user, pword, host, port):
    """Fetch the compound and measurement details of a single activity.

    Args:
        activity: activity id, interpolated unquoted into the query.
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        Result rows ``(molregno, standard_value, standard_type,
        standard_units, canonical_smiles, relation, activity_id)``.
    """
    import queryDevice

    sql = (
        "SELECT DISTINCT act.molregno, standard_value,"
        " standard_type, standard_units, canonical_smiles, relation, act.activity_id"
        " FROM activities act"
        " JOIN compound_structures cs"
        " ON act.molregno = cs.molregno"
        " WHERE activity_id = %s"
    )
    return queryDevice.queryDevice(sql % activity, release, user, pword, host, port)
def getUniprotTargets(release, user, pword, host, port):
    """Return the protein accessions of all SWISS-PROT / TREMBL targets.

    Reads ``target_dictionary.protein_accession`` directly.

    Args:
        release, user, pword, host, port: passed unchanged to
            ``queryDevice.queryDevice``.

    Returns:
        List of accessions, one per result row.
    """
    import queryDevice

    sql = (
        "SELECT protein_accession"
        " FROM target_dictionary WHERE db_source IN('SWISS-PROT', 'TREMBL')"
    )
    rawtargets = queryDevice.queryDevice(sql, release, user, pword, host, port)
    return [row[0] for row in rawtargets]
def add_target_class(level, key, path):
    '''Add target class to each row in a table using a Uniprot accession
    for release < ChEMBL 15.

    The table is rewritten through ``<path>_sed`` and moved over ``path``.
    Accessions the query does not return are written as ``None``.

    Inputs:
    level: the target class annotation level (column of target_class)
    key: the keyword in the header indicating a Uniprot Id
    path: path to the file.

    Raises ValueError when ``key`` is not a column of the header.

    NOTE(review): ``release``, ``user``, ``pword``, ``host``, ``port`` and
    ``queryDevice`` are not parameters; they must exist as module-level
    names - confirm they are defined where this function lives.

    --------------------
    Author: Felix Kruger
    '''
    import os

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting: otherwise a key column in
    # last position is never matched and its values carry a trailing newline.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]

    accessions = dict((row.split('\t')[idx], None) for row in rows)
    accStr = "','".join(map(str, accessions.keys()))
    print(len(accessions))
    data = queryDevice.queryDevice(
        "SELECT protein_accession, %s"
        " FROM target_class tc JOIN target_dictionary td ON td.tid = tc.tid"
        " WHERE protein_accession IN('%s')" % (level, accStr),
        release, user, pword, host, port)
    for tup in data:
        accessions[tup[0]] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\ttarget_class_%s\n' % (header_line, level))
        for row in rows:
            out.write("%s\t%s\n" % (row, accessions[row.split('\t')[idx]]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def add_target_class(level, key, path):
    '''Add target class to each row in a table using a Uniprot accession
    for release < ChEMBL 15.

    The table is rewritten through ``<path>_sed`` and moved over ``path``.
    Accessions the query does not return are written as ``None``.

    Inputs:
    level: the target class annotation level (column of target_class)
    key: the keyword in the header indicating a Uniprot Id
    path: path to the file.

    Raises ValueError when ``key`` is not a column of the header.

    NOTE(review): ``release``, ``user``, ``pword``, ``host``, ``port`` and
    ``queryDevice`` are not parameters; they must exist as module-level
    names - confirm they are defined where this function lives.

    --------------------
    Author: Felix Kruger
    '''
    import os

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting: otherwise a key column in
    # last position is never matched and its values carry a trailing newline.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]
    row_accessions = [row.split('\t')[idx] for row in rows]

    accessions = dict.fromkeys(row_accessions)  # value None until looked up
    accStr = "','".join(map(str, accessions.keys()))
    print(len(accessions))
    sql = (
        "SELECT protein_accession, %s"
        " FROM target_class tc JOIN target_dictionary td ON td.tid = tc.tid"
        " WHERE protein_accession IN('%s')"
    )
    data = queryDevice.queryDevice(sql % (level, accStr), release, user, pword, host, port)
    for tup in data:
        accessions[tup[0]] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\ttarget_class_%s\n' % (header_line, level))
        for row, acc in zip(rows, row_accessions):
            out.write("%s\t%s\n" % (row, accessions[acc]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def conflicts4Sam():
    """Write one ``data/forSam_<conflict>.tab`` table per conflict string.

    Selects all ``map_pfam`` rows with ``mapType = 'conflict'`` together
    with their pubmed id and conflict string.  For each distinct conflict
    string a table with the columns molregno, pubmed, prediction (the
    mapped domain) and activity_id is written, rows in query order.

    NOTE(review): ``release`` is not a parameter; it must exist as a
    module-level name - confirm it is defined where this function lives.
    """
    import queryDevice

    conflicts = queryDevice.queryDevice(
        "SELECT DISTINCT mpf.molregno, mpf.domain, dcs.pubmed_id, con.conflict, mpf.activity_id"
        " FROM map_pfam mpf"
        " JOIN activities act ON mpf.activity_id = act.activity_id"
        " JOIN docs dcs ON dcs.doc_id = act.doc_id"
        " JOIN conflicts con ON mpf.protein_accession = con.protein_accession"
        " WHERE mapType = 'conflict'",
        release)

    # Group once instead of rescanning every row for each conflict string.
    rows_by_conflict = {}
    for conflict in conflicts:
        rows_by_conflict.setdefault(conflict[3], []).append(conflict)

    for confStr in rows_by_conflict:
        # ``with`` closes the file even if a write fails.
        with open('data/forSam_%s.tab' % confStr, 'w') as out:
            out.write('molregno\tpubmed\tprediction\tactivity_id\n')
            for conflict in rows_by_conflict[confStr]:
                molregno = conflict[0]
                domain = conflict[1]
                pubmed = conflict[2]
                actId = conflict[4]
                out.write('%s\t%s\t%s\t%s\n' % (molregno, pubmed, domain, actId))
def fullSeq(path, params):
    """Append ``seq_id`` and ``seq_sim`` columns to a table of protein pairs.

    The first two columns of every data row are read as protein accessions.
    Their sequences are fetched from ``target_dictionary`` and aligned with
    ``needle.needle``; the values parsed by ``needle.parseNeedle`` are
    appended to the row.  ``None`` is written for both columns when either
    sequence is missing.  Each ordered pair is aligned only once.

    The result is written to ``<path>_sed`` and moved over ``path`` once
    complete.  The original opened ``path`` for writing straight away, so a
    failure part-way through destroyed the input table.

    Args:
        path: path of the table to annotate in place.
        params: dictionary with a ``'needlepath'`` entry; also passed
            unchanged to ``queryDevice.queryDevice``.

    Raises:
        OSError: if the temporary file cannot replace ``path``.
    """
    import os

    import queryDevice
    import needle

    with open(path, 'r') as inFile:
        lines = inFile.readlines()
    header_line = lines[0].rstrip('\n')

    seqIdDict = {}  # 'acc1_acc2' -> (seqSim, seqId)
    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write("%s\tseq_id\tseq_sim\n" % header_line)
        for line in lines[1:]:
            # Strip the terminator first so the second accession is clean
            # even when the table has exactly two columns.
            row = line.rstrip('\n')
            elements = row.split('\t')
            proteinAcc_1 = elements[0]
            proteinAcc_2 = elements[1]
            pairName = '_'.join([proteinAcc_1, proteinAcc_2])

            if pairName not in seqIdDict:
                data = queryDevice.queryDevice("SELECT td.protein_sequence, td.protein_accession FROM target_dictionary td WHERE td.protein_accession IN ('%s')" % "','".join([proteinAcc_1, proteinAcc_2]), params)
                lkp = dict((entry[1], entry[0]) for entry in data)
                try:
                    seq_1 = lkp[proteinAcc_1]
                    seq_2 = lkp[proteinAcc_2]
                except KeyError:
                    seqIdDict[pairName] = (None, None)
                else:
                    # Align the sequences using needle from EMBOSS and
                    # parse the output of the alignment.
                    needleReport = needle.needle(params['needlepath'], seq_1, seq_2)
                    (seqSim, seqId) = needle.parseNeedle(needleReport)
                    seqIdDict[pairName] = (seqSim, seqId)

            (seqSim, seqId) = seqIdDict[pairName]
            out.write("%s\t%s\t%s\n" % (row, seqId, seqSim))

    os.rename(tmp_path, path)
def add_species(key, path):
    '''Add species annotation to each row in a table using a Uniprot
    accession for release < ChEMBL 15.

    Appends a quoted ``species_<key>`` column holding
    ``target_dictionary.organism``.  The table is rewritten through
    ``<path>_sed`` and moved over ``path``.  Accessions the query does not
    return get the placeholder ``0``.

    Inputs:
    key: the keyword in the header indicating a Uniprot Id
    path: path to the file.

    Raises ValueError when ``key`` is not a column of the header.

    NOTE(review): ``release``, ``user``, ``pword``, ``host``, ``port`` and
    ``queryDevice`` are not parameters; they must exist as module-level
    names - confirm they are defined where this function lives.

    --------------------
    Author: Felix Kruger
    '''
    import os

    with open(path, 'r') as infile:
        lines = infile.readlines()

    # Strip the line terminator before splitting: otherwise a key column in
    # last position is never matched and its values carry a trailing newline.
    header_line = lines[0].rstrip('\n')
    idx = header_line.split('\t').index(key)
    rows = [line.rstrip('\n') for line in lines[1:]]

    # 0 is the placeholder for accessions without a query hit.
    accessions = dict((row.split('\t')[idx], 0) for row in rows)
    accStr = "','".join(map(str, accessions.keys()))
    data = queryDevice.queryDevice(
        "SELECT protein_accession, organism"
        " FROM target_dictionary"
        " WHERE protein_accession IN('%s') " % accStr,
        release, user, pword, host, port)
    for tup in data:
        accessions[tup[0]] = tup[1]

    tmp_path = '_'.join([path, "sed"])
    with open(tmp_path, 'w') as out:
        out.write('%s\tspecies_%s\n' % (header_line, key))
        for row in rows:
            out.write("%s\t\"%s\"\n" % (row, accessions[row.split('\t')[idx]]))

    # Rename instead of shelling out to ``mv``: no quoting problems with
    # unusual paths, and failures raise instead of being ignored.
    os.rename(tmp_path, path)
def orthologs():
    """Build and annotate the ortholog comparison table.

    Reads the configuration from gla.yaml in the current directory, derives
    all input and output file names from its 'release' and 'vsCompara'
    entries, creates the ortholog table and the pairwise results file, and
    finally runs the annotation steps on the results file.

    Raises KeyError when one of the configuration entries used by this
    function is missing.
    """
    import queryDevice
    import mkDict
    import queries
    import writePairs
    import align
    import addProperties
    import mkHomologTable
    import readIds
    import yaml

    # Read config file.
    with open('gla.yaml') as paramFile:
        params = yaml.safe_load(paramFile)

    # Fail before any work is done if an entry used below is missing.
    for required in ('needlepath', 'vsCompara', 'release',
                     'homologyTypeOrthologs'):
        if required not in params:
            raise KeyError(required)
    vsCompara = params['vsCompara']
    release = params['release']

    # Input files.
    comparaOrthologs = "data/orthologs_%s.txt" % vsCompara
    comparaHumanIds = "data/humanIds_%s.txt" % vsCompara
    comparaRatIds = "data/ratIds_%s.txt" % vsCompara

    # Assign output filenames.
    dictFile = "data/ortho_compDict_%s.pkl" % release
    results = "data/orthologs_%s_%s.tab" % (release, vsCompara)
    orthoTab = "data/orthologTable_%s.txt" % vsCompara

    # Create output files.
    humanLkp = readIds.readIds(comparaHumanIds)
    ratLkp = readIds.readIds(comparaRatIds)
    mkHomologTable.homologTable(comparaOrthologs, orthoTab, humanLkp, ratLkp)
    # NOTE(review): the query builder is called 'paralogs' although it is
    # fed the ortholog table -- presumably shared between both analyses.
    query = queries.paralogs(orthoTab)
    acts = queryDevice.queryDevice(query, params)
    mkDict.activities(acts, dictFile)
    writePairs.homologMedian(
        params['homologyTypeOrthologs'], orthoTab, dictFile, results)

    # Annotate output files.
    align.pfam_a(results, params)
    align.bSite(results, params)
    addProperties.addMolweight('molregno', results, params)
    addProperties.addTargetClass("L1", "accession1", results, params)
    addProperties.addPrefName("accession1", results, params)
    addProperties.addPrefName("accession2", results, params)
def drugs(pfamDict, release, user, pword, host, port):
    """Write drug ingredient examples to data/drugExamples.tab.

    For every data row of data/map_pfam.txt (the first line is skipped as
    header) the formulations table is queried for the row's molregno. When
    an ingredient is found, one row is written with the first ingredient,
    the Pfam domain, the Uniprot accession and the number of domains listed
    for that accession in pfamDict.

    Inputs:
    pfamDict -- dictionary keyed by Uniprot accession; each value must
                provide a 'domains' entry whose length is reported.
    release, user, pword, host, port -- passed to queryDevice.queryDevice.

    Raises KeyError when an accession of the mapping is not in pfamDict.
    Relies on the module-level name queryDevice.

    NOTE(review): this source contains a second, equivalent definition of
    drugs; if both live in the same module the later one wins.
    """
    import time

    with open('data/map_pfam.txt', 'r') as infile:
        lines = infile.readlines()

    with open('data/drugExamples.tab', 'w') as out:
        out.write('ingredient\tpfam\tuniprot\tnPfam\n')
        for line in lines[1:]:
            elements = line.split('\t')
            molregno = elements[2]
            pfam = elements[1]
            uniprot = elements[3]
            nPfam = len(pfamDict[uniprot]['domains'])
            # Short pause before each query, as in the rest of this code.
            time.sleep(0.03)
            found = queryDevice.queryDevice(
                "SELECT ingredient FROM formulations WHERE molregno = %s"
                % molregno, release, user, pword, host, port)
            try:
                ingredient = found[0][0]
            except IndexError:
                # No formulation for this compound: not a drug example.
                continue
            out.write('%s\t%s\t%s\t%s\n' % (ingredient, pfam, uniprot, nPfam))
def exportProps(selected, propDict, threshold, release, user, pword, host, port):
    """Export compound properties for the selected domains as a table.

    For every domain in selected, each distinct molregno (first item of the
    entries in propDict[domain]) is looked up in compound_properties,
    restricted to molecules of type 'Small molecule'. Compounds without a
    matching record are skipped.

    Two files are written:
    data/cmpdPropssed.tab -- the raw table, missing values printed as 'None'
    data/cmpdProps_pKi<int(threshold)>_chembl<release>.tab -- the same table
        with every occurrence of 'None' replaced by 'NA'.

    Inputs:
    selected  -- iterable of domain names (keys of propDict).
    propDict  -- dictionary mapping a domain name to a sequence of entries
                 whose first item is a molregno.
    threshold -- number; its integer part becomes part of the file name.
    release, user, pword, host, port -- passed to queryDevice.queryDevice.
    """
    import queryDevice

    rawPath = 'data/cmpdPropssed.tab'
    finalPath = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    rowFormat = '\t'.join(['%s'] * 10) + '\n'

    ### Write output to a table.
    with open(rawPath, 'w') as out:
        out.write('domain\tmolregno\tmolweight\tlogP\tHBA\tHBD\tPSA\trtb'
                  '\tacd_most_apka\tacd_most_bpka\n')
        for domain in selected:
            # Query each compound once per domain, in first-seen order.
            seen = set()
            for mol in propDict[domain]:
                molregno = mol[0]
                if molregno in seen:
                    continue
                seen.add(molregno)
                found = queryDevice.queryDevice(
                    "SELECT DISTINCT cp.molregno, mw_freebase, alogp, HBA, "
                    "HBD, PSA, RTB, ACD_MOST_APKA, ACD_MOST_BPKA "
                    "FROM compound_properties cp "
                    "JOIN molecule_dictionary md "
                    "ON cp.molregno = md.molregno "
                    "WHERE cp.molregno ='%s' "
                    "AND md.molecule_type = 'Small molecule'" % molregno,
                    release, user, pword, host, port)
                try:
                    tup = found[0]
                except IndexError:
                    # Not a small molecule or no properties recorded.
                    continue
                out.write(rowFormat % ((domain,) + tuple(tup[:9])))

    # Same substitution the former 'sed s/None/NA/g' call performed, done in
    # Python so that no shell command has to be assembled from parameters.
    with open(rawPath, 'r') as raw:
        with open(finalPath, 'w') as final:
            for line in raw:
                final.write(line.replace('None', 'NA'))
def get_doms(el_targets):
    """Find multi-domain architectures.

    Inputs:
    el_targets -- list of eligible targets; the first item of every entry
                  is used as the target id (tid).

    Returns a dictionary mapping each tid returned by the query to the list
    of its domain names, in the order the rows were returned. Targets the
    query returns no domains for are absent. An empty input yields an empty
    dictionary without touching the database.

    Relies on the module-level names queryDevice, RELEASE, USER, PWORD,
    HOST and PORT.
    """
    pfam_lkp = {}
    tids = [x[0] for x in el_targets]
    if not tids:
        # Avoid a round trip with a degenerate IN('') clause.
        return pfam_lkp

    tidstr = "', '".join(str(t) for t in tids)
    data = queryDevice.queryDevice("""
            SELECT tid, domain_name
            FROM target_components tc
            JOIN component_domains cd
              ON cd.component_id = tc.component_id
            JOIN domains d
              ON d.domain_id = cd.domain_id
            WHERE tc.tid IN('%s')""" % tidstr, RELEASE, USER, PWORD, HOST, PORT)
    for ent in data:
        pfam_lkp.setdefault(ent[0], []).append(ent[1])
    return pfam_lkp
def drugs(pfamDict, release, user, pword, host, port):
    """Collect example drug ingredients per mapped domain.

    Reads data/map_pfam.txt, skips its first line and, for each remaining
    row, takes the Pfam domain (column 1), the molregno (column 2) and the
    Uniprot accession (column 3, counted from 0). The formulations table is
    queried for the molregno; if it yields an ingredient, a row holding the
    first ingredient, the domain, the accession and the number of entries
    in pfamDict[accession]['domains'] is written to data/drugExamples.tab.

    Inputs:
    pfamDict -- dictionary keyed by Uniprot accession with a 'domains'
                entry per accession.
    release, user, pword, host, port -- passed to queryDevice.queryDevice.

    Raises KeyError for an accession that is missing from pfamDict.
    Relies on the module-level name queryDevice.

    NOTE(review): an equivalent definition of drugs appears earlier in this
    source; if both live in the same module, this later one is the one in
    effect.
    """
    import time

    with open('data/map_pfam.txt', 'r') as infile:
        rows = infile.readlines()[1:]

    with open('data/drugExamples.tab', 'w') as out:
        out.write('ingredient\tpfam\tuniprot\tnPfam\n')
        for row in rows:
            elements = row.split('\t')
            pfam = elements[1]
            molregno = elements[2]
            uniprot = elements[3]
            nPfam = len(pfamDict[uniprot]['domains'])
            # Short pause before each query.
            time.sleep(0.03)
            result = queryDevice.queryDevice(
                "SELECT ingredient FROM formulations WHERE molregno = %s"
                % molregno, release, user, pword, host, port)
            try:
                ingredient = result[0][0]
            except IndexError:
                # Compound has no formulation entry; nothing to report.
                continue
            out.write('%s\t%s\t%s\t%s\n' % (ingredient, pfam, uniprot, nPfam))
# R interpreter used for all plotting and statistics scripts.
_R_BINARY = '/ebi/research/software/Linux_x86_64/bin/R-2.11.0'


def _load_pickle(path):
    """Return the object stored in the pickle file at path."""
    import pickle
    # Binary mode: required on Python 3 and harmless on Python 2.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _run_r_batch(script, *args):
    """Run script via 'R CMD BATCH --vanilla', passing each arg as -<arg>."""
    import os
    flags = ' '.join('-%s' % arg for arg in args)
    return os.system(
        '%s CMD BATCH --vanilla %s %s' % (_R_BINARY, flags, script))


def analysis(release):
    """Run the complete mapping analysis for one release.

    Reads the connection parameters and the activity threshold from
    mpf.yaml, loads the pickled dictionaries for the given release from the
    data directory and then, in this order: computes and plots Pfam
    statistics, evaluates ligand binding within Pfam domains against PDBe
    and Uniprot, prints group-size summaries, evaluates the predicted
    mappings, fits and plots the power-law distributions, and exports and
    plots ligand properties for a fixed set of domains.

    Inputs:
    release -- release identifier; used in file names, passed to the helper
               modules and to the R scripts.

    All results are written to files or printed; nothing is returned.
    """
    ####
    #### Load parameters.
    ####
    import yaml
    # Read config file.
    with open('mpf.yaml') as paramFile:
        params = yaml.safe_load(paramFile)
    user = params['user']
    pword = params['pword']
    host = params['host']
    port = params['port']
    th = params['threshold']

    ####
    #### Load data.
    ####
    ## Set threshold for all calculations.
    import numpy as np
    threshold = -np.log10(th * 10**(-6))

    ## Get all ChEMBL targets with a Uniprot accession.
    import getUniprotTargets
    chemblTargets = getUniprotTargets.getUniprotTargets(
        release, user, pword, host, port)

    ## Get a list of all human (!) ChEMBL targets
    # The values of chemblTargets are compared against this organism name.
    humChembl = dict(
        (target, 0) for target in chemblTargets
        if chemblTargets[target] == 'Homo sapiens')

    ## Read all human protein coding genes
    import parse
    humProtCod = parse.parse2col('data/proteinCoding.tab', True, 1, 0)
    print("We are dealing with %s human proteins" % len(humProtCod.keys()))

    ## Load the pfamDict, the pdbDict and the uniprotDict.
    pfamDict = _load_pickle('data/protCodPfamDict_%s.pkl' % release)
    pdbDict = _load_pickle('data/pdbDict_%s.pkl' % release)
    uniprotDict = _load_pickle('data/bsDictUniprot_%s.pkl' % release)
    print("number of targets with binding site information %s"
          % len(uniprotDict.keys()))

    ## Load the uniDict.
    import parseUniChem
    uniDict = parseUniChem.parse('data/unichemMappings.txt')

    ## Load the propDict.
    propDict = _load_pickle('data/propDict_%s.pkl' % release)

    ####
    #### Generate Plots.
    ####
    ## For each target in PfamDict, calculate the ratio of domain over
    ## non-domain regions.
    import getRatioUnstruct
    import writeTable
    pfamDict = getRatioUnstruct.getRatio(
        pfamDict, humProtCod, release, user, pword, host, port)
    writeTable.writePfam(
        pfamDict, humProtCod, humChembl, chemblTargets, release)
    _run_r_batch('plotPfamStat.R', release)

    ## Assess small molecule binding within Pfam domains for PDBe entries.
    import matchData
    import evaluatePred
    pdbDict = matchData.pdbe(pdbDict, pfamDict, release)
    evaluatePred.pdbe(pdbDict, 'within', release)
    _run_r_batch('ecdf.R', 'within', 'PDB', release)

    ## Assess small molecule binding within Pfam domains for Uniprot entries.
    uniprotDict = matchData.uniprot(uniprotDict, pfamDict, release)
    evaluatePred.uniprot(uniprotDict, 'within', release)
    _run_r_batch('ecdf.R', 'within', 'Uni', release)

    ## Print a summary of the number of targets and domains covered by the
    ## mapping.
    import groupSize
    # Result unused here; the call is kept in case it has side effects.
    groupSize.uniqueDomains(pfamDict)
    singleDomains = groupSize.singles(chemblTargets, pfamDict)
    # The single-domain result computed above is what groupSize needs; the
    # name previously passed here was never defined.
    groupsAll = groupSize.groupSize(chemblTargets, pfamDict, singleDomains)
    print("all possible groups (single, none, multi, conflict): %s"
          % (groupsAll,))
    (single, multi, conflict) = groupSize.groupSizeMap(
        chemblTargets, release, user, pword, host, port)
    print("all covered targets (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))
    (single, multi, conflict) = groupSize.actSizeMap(
        chemblTargets, release, user, pword, host, port)
    # actSizeMap reports activities, not targets.
    print("all covered activities (single, multi, conflict):  %s %s %s"
          % (len(single), len(multi), len(conflict)))

    ## Plot the evaluation of the mappings.
    import queryDevice
    intacts = queryDevice.queryDevice(
        """SELECT mpf.protein_accession, mpf.domain,mpf.molregno,
                  pfd.start, pfd.end, mpf.maptype, md.chembl_id
           FROM map_pfam mpf
           JOIN pfam_domains pfd
             ON pfd.protein_accession = mpf.protein_accession
           JOIN molecule_dictionary md
             ON md.molregno = mpf.molregno
           WHERE mpf.domain = pfd.domain""",
        release, user, pword, host, port)
    # ...against PDBe
    pdbDict = matchData.pdbePredicted(pdbDict, intacts, uniDict)
    evaluatePred.pdbePredicted(pdbDict, 'prediction', release)
    _run_r_batch('ecdf.R', 'prediction', 'PDB', release)
    # ...against uniprot
    uniprotDict = matchData.uniprotPredicted(uniprotDict, intacts)
    evaluatePred.uniprotPredicted(uniprotDict, 'prediction', release)
    _run_r_batch('ecdf.R', 'prediction', 'Uni', release)

    ## Map the overlap (currently disabled)
    #import overlap
    #tholds = [50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001, 0.0005,0.0001, 0.00005,0.000001]
    #overlap.overlap(propDict, tholds, release)

    ## Power Law Distribution of domain occurences
    ## Prepare the data for the power law plot.
    ## 1. Count the targets and compounds per domain using the propDict
    ## 2. Count a human genes per domain using the Pfam dictionary
    ## 3. Plot the power law distributions for all domains and overlay 25
    ##    most frequent domains
    import countFreqs
    import plplot
    import plplotRaw
    countFreqs.countLigs(
        humProtCod.keys(), chemblTargets, release, user, pword, host, port)
    countFreqs.countDoms(humProtCod.keys(), pfamDict)
    filenames = ['genFreq.tab', 'domLigs.tab', 'targLigs.tab']
    for filename in filenames:
        _run_r_batch('statPowerLaw.R', filename)
        al, minx = parse.rdstatLogs('data/powerLawLog%s' % filename)
        freqs = parse.col2intlist('data/%s' % filename, 1, True)
        print("%s %s %s %s %s %s" % (
            len(freqs), minx, al, filename, type(freqs), type(freqs[1])))
        plplot.plplot(freqs, minx, al, filename)
        plplotRaw.plplotRaw(freqs, filename)

    ## Plot the ligand properties.
    import export
    selected = ['Pkinase', 'Pkinase_Tyr', 'p450', 'SNF', 'Trypsin', 'RVP']
    export.exportProps(
        selected, propDict, threshold, release, user, pword, host, port)
    filename = 'data/cmpdProps_pKi%s_chembl%s.tab' % (int(threshold), release)
    _run_r_batch('pca.R', filename)