def insertSimcomp(): dirpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/kegg/kegg_20161010/simcomp' # SELECT com_id FROM `compound` where com_kegg_id!='' q = 'SELECT com_id,com_kegg_id FROM `compound` where com_kegg_id!=""' resp = util.mysqlCommit(db, cursor, q, True) kegg2ComIdMap = {} for i in resp: kegg2ComIdMap[i[1]] = i[0] # for filename in os.listdir(dirpath): if filename.endswith(".sim"): keggId = filename.split('_')[1].strip('.sim').strip() fpath = os.path.join(dirpath, filename) simcomp = [] with open(fpath) as infile: for line in infile: words = line.split('=') words = [i.strip() for i in words] keggId2 = words[0] if keggId2 in kegg2ComIdMap.keys(): comId = kegg2ComIdMap[keggId2] score = words[1] simcomp.append(comId + ':' + keggId2 + '=' + score) if len(simcomp) != 0: simcompStr = '\n'.join(simcomp) simcompStr = '"' + simcompStr + '"' qf = 'UPDATE compound SET com_simcomp=' + simcompStr qr = ' WHERE com_kegg_id=' + '"' + keggId + '"' q = qf + qr util.mysqlCommit(db, cursor, q)
def main(): # TODO get comId and proId from DB maxComId = 1000 #21994 comList = range(1,maxComId+1) comList = ['COM'+str(i).zfill(8) for i in comList] maxProId = 3334 proList = range(1,maxProId+1) proList = ['PRO'+str(i).zfill(8) for i in proList] # blm = BLM() # nCom = len(comList) nPro = len(proList) for i,c in enumerate(comList): for j,p in enumerate(proList): msg1 = 'search/predict for c= '+c+' ('+str(i+1)+'/'+str(nCom)+') ' msg2 = ' vs p= '+p+' ('+str(j+1)+'/'+str(nPro)+') ' msg = msg1+msg2 print msg qf = 'SELECT source,timestamp FROM compound_vs_protein WHERE' qr = ' com_id='+'"'+c+'"'+' AND pro_id='+'"'+p+'"' q = qf+qr resp = util.mysqlCommit(db,cursor,q) needToPredict = True if resp!=None: src,stamp = resp if 'predictor' not in src:# then it is a fact needToPredict = False if needToPredict: print 'predicting ...' # (re)predict # weight here is the confidence level that the link exists weight, srcs = blm.predict(c,p) # insert if weight > 0.0: print 'inserting the prediction ...' insertVals = [c,p,str(weight),srcs] insertVals = ['"'+j+'"' for j in insertVals] qf = '''INSERT INTO compound_vs_protein (com_id,pro_id, weight,source) VALUES (''' qm = ','.join(insertVals) qr = ')' q = qf+qm+qr util.mysqlCommit(db,cursor,q)
def insertDrugVsProtein(drugProteinDict): idx = 0 log = [] logFpath = outDir + '/insertDrugVsProtein.log' src = 'drugbank.ca' weight = '1.0' for i, v in drugProteinDict.iteritems(): idx += 1 qf = 'SELECT com_id FROM compound WHERE com_drugbank_id =' qm = '"' + i + '"' qr = '' sql = qf + qm + qr comIdR = util.mysqlCommit(db, cursor, sql) pList = list(set(v['uniprotTargets'])) for p in pList: msg = 'inserting ' + i + ' vs ' + p + ' idx= ' + str( idx) + ' of ' + str(len(drugProteinDict)) print msg qf = 'SELECT pro_id FROM protein WHERE pro_uniprot_id =' qm = '"' + p + '"' qr = '' sql = qf + qm + qr proIdR = util.mysqlCommit(db, cursor, sql) if comIdR != None and proIdR != None: proId = proIdR[0] comId = comIdR[0] insertVals = [comId, proId, weight, src] insertVals = ['"' + j + '"' for j in insertVals] qf = '''INSERT INTO compound_vs_protein (com_id,pro_id, weight,source) VALUES (''' qm = ','.join(insertVals) qr = ')' sql = qf + qm + qr util.mysqlCommit(db, cursor, sql) else: msg = 'FAIL: ' + msg print msg log.append(msg) with open(logFpath, 'w') as f: for l in log: f.write(str(l) + '\n')
def insertComFromDrugbank(comIdx): drugData = None fpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/drugbank/drugbank_20161002/drugbank_drug_data_2016-10-05_10:16:42.860649.pkl' with open(fpath, 'rb') as handle: drugData = pickle.load(handle) for i, v in drugData.iteritems(): if len(v['uniprotTargets']) == 0: continue comIdx += 1 print 'inserting', i, 'comIdx=', str(comIdx), 'of at most', str( len(drugData)) comId = str(comIdx) comId = comId.zfill(8) comId = 'COM' + comId comDrugbankId = i na = '' insertVals = [] insertVals.append(comId) insertVals.append(comDrugbankId) insertKeys = [ 'CAS number', 'pubchemCid', 'InChI Key', 'SMILES', 'com_knapsack_id', 'com_kegg_id', 'com_simcomp' ] for k in insertKeys: if k in v.keys(): insertVals.append(v[k]) else: insertVals.append(na) insertVals = ['' if len(i) == 0 else i for i in insertVals] insertVals = ['' if 'not-available' in i else i for i in insertVals] insertVals = ['"' + iv + '"' for iv in insertVals] assert len(insertVals) == 9 qf = '''INSERT INTO compound (com_id, com_drugbank_id, com_cas_id, com_pubchem_id, com_inchikey, com_smiles, com_knapsack_id, com_kegg_id, com_simcomp) VALUES (''' qm = ','.join(insertVals) qr = ')' sql = qf + qm + qr util.mysqlCommit(db, cursor, sql) return comIdx
def insertComFromKnapsack(comIdx): plantCompoundDict = None fpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/knapsack/20161003/knapsack_jsp_plant_vs_compound_2016-10-04_16:34:06.468234.pkl' with open(fpath, 'rb') as handle: plantCompoundDict = pickle.load(handle) compoundDict = {} for comList in plantCompoundDict.values(): for kId, cas, name, form in comList: compoundDict[kId] = cas idx = 0 nCom = len(compoundDict) matchList = [] for k, cas in compoundDict.iteritems(): idx += 1 print 'insert/updating', k, 'idx=', str(idx), 'of', nCom if util.mysqlExist(db, cursor, table='compound', where='com_cas_id=' + '"' + cas + '"'): # print 'match' matchList.append(cas) qf = 'UPDATE compound SET com_knapsack_id=' qm = '"' + k + '"' qr = 'WHERE com_cas_id=' + '"' + cas + '"' q = qf + qm + qr util.mysqlCommit(db, cursor, q) else: comIdx += 1 comIdStr = 'COM' + str(comIdx).zfill(8) insertVals = [comIdStr, cas, k] insertVals = ['"' + i + '"' for i in insertVals] qf = 'INSERT INTO compound (com_id,com_cas_id,com_knapsack_id) VALUES (' qm = ','.join(insertVals) qr = ')' q = qf + qm + qr util.mysqlCommit(db, cursor, q) fpath = outDir + '/knapsack_compound_match_with_drugbank.lst' with open(fpath, 'w') as f: for m in matchList: f.write(str(m) + '\n') return comIdx
def insertPlants(plantList): nPlant = len(plantList) for idx, p in enumerate(plantList): plaId = str(idx+1) plaId = plaId.zfill(8) plaId = '"'+'PLA'+plaId+'"' print 'inserting ', plaId, 'of', str(nPlant) plaName = '"'+p+'"' qf = 'INSERT INTO plant (pla_id,pla_name) VALUES (' qm = plaId+','+plaName qr = ')' q = qf+qm+qr util.mysqlCommit(q)
def insertProteinVsDisease(proteinDiseaseDict): src = 'uniprot.org' n = len(proteinDiseaseDict) log = [] logFpath = dirPath + '/insertProteinVsDisease.log' idx = 0 for p, v in proteinDiseaseDict.iteritems(): idx += 1 qf = 'SELECT pro_id FROM protein WHERE pro_uniprot_abbrv =' qm = '"' + p + '"' qr = '' q = qf + qm + qr proIdR = util.mysqlCommit(db, cursor, q) for d in v['disease']: msg = 'inserting ' + p + ' vs ' + d[1] + 'idx= ' + str( idx) + ' of ' + str(n) print msg qf = 'SELECT dis_id FROM disease WHERE dis_omim_id =' qm = '"' + d[2] + '"' qr = '' q = qf + qm + qr disIdR = util.mysqlCommit(db, cursor, q) if proIdR != None and disIdR != None: proId = proIdR[0] disId = disIdR[0] insertVals = [proId, disId, src] insertVals = ['"' + i + '"' for i in insertVals] qf = 'INSERT INTO protein_vs_disease (pro_id,dis_id,pro_vs_dis_source) VALUES (' qm = ','.join(insertVals) qr = ')' q = qf + qm + qr util.mysqlCommit(db, cursor, q) else: log.append('FAIL: ' + msg) with open(logFpath, 'w') as f: for i in log: f.write(str(i) + '\n')
def insertPlantVsCompound(plantCompoundDict): pc = plantCompoundDict src = 'knapsack.kanaya.naist.jp' log = []; logFpath = outDir+'/insertPlantVsCompound.log' n = len(pc); idx = 0; for p,v in pc.iteritems(): idx += 1 qf = 'SELECT pla_id FROM plant WHERE pla_name =' qm = '"'+p+'"' qr = '' q = qf+qm+qr plaIdR = util.mysqlCommit(db,cursor,q); comList = list( set([c[0] for c in v]) ) for c in comList: msg = 'inserting '+ p+ ' vs '+ c+ ' idx= '+ str(idx)+ ' of '+ str(n) print msg qf = 'SELECT com_id FROM compound WHERE com_knapsack_id =' qm = '"' + c + '"' qr = '' q = qf+qm+qr comIdR = util.mysqlCommit(db,cursor,q); if plaIdR!=None and comIdR!=None: plaId = plaIdR[0] comId = comIdR[0] insertVals = [plaId,comId,src] insertVals = ['"'+i+'"' for i in insertVals] qf = 'INSERT INTO plant_vs_compound (pla_id,com_id,source) VALUES (' qm = ','.join(insertVals) qr = ')' q = qf+qm+qr util.mysqlCommit(db,cursor,q) else: log.append('FAIL: '+msg) with open(logFpath,'w') as f: for i in log: f.write(str(i)+'\n')
def insertComFromKegg(comIdx): comDataDpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/kegg/kegg_20161010/keggCom_20161010_1-100K' drugDataFpath = '/home/tor/robotics/prj/csipb-jamu-prj/dataset/kegg/kegg_20161010/keggdrug_data_2016-10-11_16:58:04.683546.pkl' # Load Kegg compound data data = {} for filename in os.listdir(comDataDpath): if filename.endswith(".pkl"): fpath = os.path.join(comDataDpath, filename) dPerFile = {} with open(fpath, 'rb') as handle: dPerFile = pickle.load(handle) for k, v in dPerFile.iteritems(): if len(v) != 0: data[k] = v sortedK = data.keys() sortedK.sort() fpath = outDir + '/keggComData_validComId.lst' with open(fpath, 'w') as f: for k in sortedK: f.write(str(k) + '\n') # Load Kegg drug data, to infer their drugbank equivalent drugData = None with open(drugDataFpath, 'rb') as handle: drugData = pickle.load(handle) # Update or Insert insertList = [] n = len(data) idx = 0 for keggId, d in data.iteritems(): idx += 1 print 'insert/update keggId=', keggId, 'idx=', idx, 'of', n knapsackId = '' if 'knapsackId' in d.keys(): knapsackId = d['knapsackId'] drugbankId = '' if 'keggDrugId' in d.keys(): _ = drugData[d['keggDrugId']]['drugbankId'] if len(_) != 0: drugbankId = _ casId = '' if 'casId' in d.keys(): casId = d['casId'] insert = False if knapsackId != '': # check if exist if util.mysqlExist(db, cursor, table='compound', where='com_knapsack_id=' + '"' + knapsackId + '"'): # update based on knapsackID qf = 'UPDATE compound ' qm = 'SET ' + 'com_kegg_id=' + '"' + keggId + '"' if casId != '': qm = qm + ',' + ' com_cas_id=' + '"' + casId + '"' if drugbankId != '': qm = qm + ',' + ' com_drugbank_id=' + '"' + drugbankId + '"' qr = ' WHERE com_knapsack_id=' + '"' + knapsackId + '"' q = qf + qm + qr resp = util.mysqlCommit(db, cursor, q) else: insert = True if drugbankId != '': if util.mysqlExist(db, cursor, table='compound', where='com_drugbank_id=' + '"' + drugbankId + '"'): # update based on drugbankId qf = 'UPDATE compound ' qm = 'SET ' + 'com_kegg_id=' + '"' + keggId + '"' if casId != '': qm = qm + ',' + ' com_cas_id=' + '"' + casId + '"' if knapsackId != '': qm = qm + ',' + ' com_knapsack_id=' + '"' + knapsackId + '"' qr = ' WHERE com_drugbank_id=' + '"' + drugbankId + '"' q = qf + qm + qr resp = util.mysqlCommit(db, cursor, q) else: insert = True if insert: comIdx += 1 comIdStr = 'COM' + str(comIdx).zfill(8) insertVals = [comIdStr, casId, drugbankId, knapsackId, keggId] insertVals = ['"' + i + '"' for i in insertVals] qf = 'INSERT INTO compound (com_id,com_cas_id,com_drugbank_id,com_knapsack_id,com_kegg_id)' qr = ' VALUES (' + ','.join(insertVals) + ')' q = qf + qr resp = util.mysqlCommit(db, cursor, q) insertList.append(q) insertListFpath = outDir + '/insertion_from_keggComData.lst' with open(insertListFpath, 'w') as f: for l in insertList: f.write(str(l) + '\n') return comIdx