def Ttest(self, pAC50All, presult):
    """Run the external descriptor T-test on the cleaned 1D/2D descriptors.

    Args:
        pAC50All: path of the tab-separated AC50 table.
        presult: output location forwarded to ``runExternalSoft.TtestDesc``.

    Both tables are loaded only to print their first 20 keys; the test
    itself is delegated to ``runExternalSoft.TtestDesc``.
    """
    p_desc_clean = self.pdesc1D2Dclean
    d_ac50 = toolbox.loadMatrix(pAC50All, sep="\t")
    d_descriptors = toolbox.loadMatrix(p_desc_clean, sep=",")
    # echo the first identifiers of each table (descriptors first)
    for d_table in (d_descriptors, d_ac50):
        print(list(d_table.keys())[:20])
    runExternalSoft.TtestDesc(p_desc_clean, pAC50All, presult)
def computeOpera(self, update):
    """Compute the OPERA descriptors of the chemical, or reload cached ones.

    Args:
        update: when 0, a descriptor file already present in ``self.prDesc``
            is reloaded instead of recomputing.

    Returns:
        1 when ``self.opera`` is already set on the instance,
        0 when the descriptors were reloaded from the cached text file,
        None after a fresh computation (kept as in the original code).

    Raises:
        Whatever ``toolbox.loadMatrix`` raises on an unreadable OPERA output;
        the offending path is printed first.
    """
    if "opera" in self.__dict__:
        return 1

    # check if descriptors already computed
    pdes = self.prDesc + self.name + ".txt"
    if path.exists(pdes) and path.getsize(pdes) > 10 and update == 0:
        with open(pdes, "r") as filin:
            llines = filin.readlines()
        # first line = descriptor names, second line = values; the first
        # column of both lines is dropped
        ldesc = llines[0].strip().split("\t")[1:]
        lval = llines[1].strip().split("\t")[1:]
        ddes = {}
        for i, desc in enumerate(ldesc):
            ddes[desc] = lval[i]
        self.allDesc = ddes
        self.log = self.log + "Desc already computed -> " + pdes + "\n"
        return 0

    dopera = {}
    prOPERA = pathFolder.createFolder(self.prDesc + "OPERA/" + self.name + "/")

    # write the hydrogen-added molecule as SDF
    molH = Chem.AddHs(self.mol)
    psdf = prOPERA + str(self.name) + ".sdf"
    with open(psdf, "w") as filsdf:
        filsdf.write(Chem.MolToMolBlock(molH))

    # PaDEL 2D descriptors: keep only those listed in LOPERA
    pdesc2D = runExternalSoft.runPadel(prOPERA)
    ddesc2D = toolbox.loadMatrix(pdesc2D, sep=",")
    transformOPERAList(ddesc2D)
    dfirst2D = ddesc2D[list(ddesc2D.keys())[0]]
    for desc2D in dfirst2D.keys():
        if desc2D in LOPERA:
            dopera[desc2D] = dfirst2D[desc2D]

    # OPERA predictions: one result file per model
    lpdesc = runExternalSoft.runOPERA(psdf, pdesc2D, prOPERA)
    for pdesc in lpdesc:
        try:
            ddesc = toolbox.loadMatrix(pdesc, ",")
        except Exception:
            # report which file failed, then propagate the real error
            # (the original crashed here through an undefined name)
            print(pdesc)
            raise
        dfirst = ddesc[list(ddesc.keys())[0]]
        for desc in dfirst.keys():
            if desc in LOPERA:
                dopera[desc] = dfirst[desc]

    self.opera = dopera
    self.allDesc.update(deepcopy(self.opera))
def parsepdf(prcytox, prresult):
    """Extract cytotoxicity min/median values per CAS number from a PDF.

    Args:
        prcytox: folder containing ``toxsci-15-0719-File012.pdf`` and
            ``toxsci-15-0719-File009.csv``.
        prresult: folder where ``cytox.csv`` is written / cached.

    Returns:
        dict keyed by CAS number. When ``cytox.csv`` already exists it is
        returned as loaded by ``toolbox.loadMatrix``; otherwise each entry
        holds the ``CytoxMin`` / ``CytoxMedian`` strings found in the PDF
        (a key is absent when the value was not found).
    """
    ppdf = prcytox + "toxsci-15-0719-File012.pdf"
    ptable = prcytox + "toxsci-15-0719-File009.csv"
    pfilout = prresult + "cytox.csv"
    if path.exists(pfilout):
        return toolbox.loadMatrix(pfilout, sep="\t")

    dtable = toolbox.loadMatrix(ptable, sep=",")
    lCASID = [dtable[chem]["CASRN"] for chem in dtable.keys()]

    dout = {}
    CAStemp = None  # last CAS number seen while scanning the text
    with open(ppdf, "rb") as fpdf:
        pdfReader = PdfFileReader(fpdf)
        for i in range(pdfReader.getNumPages()):
            pageText = pdfReader.getPage(i).extractText()
            for line in pageText.split("\n"):
                for CASID in lCASID:
                    # the CAS number is used as a regular expression
                    if search(CASID, line):
                        dout[CASID] = {}
                        CAStemp = CASID
                        break
                if CAStemp is None:
                    # values found before any CAS number cannot be assigned
                    continue
                if search("cytotox min=", line):
                    cytoxmin = findall(r"[-+]?\d*\.\d+|\d+", line)
                    # first number of the line
                    dout[CAStemp]["CytoxMin"] = cytoxmin[0]
                if search("cytotox median=", line):
                    cytoxMed = findall(r"[-+]?\d*\.\d+|\d+", line)
                    # fourth number of the line
                    dout[CAStemp]["CytoxMedian"] = cytoxMed[3]

    with open(pfilout, "w") as filout:
        filout.write("CAS\tCytoxMin\tCytoxMedian\n")
        for CASID in dout.keys():
            # "NA" for a chemical whose values were not found in the PDF
            filout.write(
                str(CASID) + "\t" + str(dout[CASID].get("CytoxMin", "NA")) +
                "\t" + str(dout[CASID].get("CytoxMedian", "NA")) + "\n")
    return dout
def mergeDescInvolve(prin, ML, nbdesc, prout):
    """Merge the descriptor-importance tables of every run, per cell folder.

    Args:
        prin: folder holding one sub-folder per run (expected to end with
            a path separator, as paths are built by concatenation).
        ML: machine-learning name used in the ``<ML>class/`` folder name.
        nbdesc: forwarded to ``runExternalSoft.runImportanceDesc``.
        prout: prefix of the ``Importance<ML>_<cell>`` output files.

    Returns:
        0

    For each cell folder one table ``Desc / Run / val`` is written and
    handed to ``runExternalSoft.runImportanceDesc``. A run without an
    importance table and without a model contributes 0.0 values.
    """
    dimportance = {}
    for prrun in listdir(prin):
        if prrun in ("Average", "descImportance", "Prob"):
            continue
        for prcell in listdir(prin + "/" + prrun + "/"):
            if prcell not in dimportance:
                dimportance[prcell] = {}
            dimportance[prcell][prrun] = {}

            prcellrun = prin + prrun + "/" + prcell + "/"
            prclass = prcellrun + str(ML) + "class/"
            pimportance = prclass + "ImportanceDesc"
            if not path.exists(pimportance):
                pmodel = prclass + "model.RData"
                ptrain = prcellrun + "trainSet.csv"
                if not path.exists(pmodel):
                    # nothing to build the table from: keep the empty entry
                    continue
                runExternalSoft.createImportanceTable(pmodel, ML, ptrain,
                                                      prclass)
            dimportance[prcell][prrun] = toolbox.loadMatrix(pimportance,
                                                            sep="\t")

    # write global table
    for typeAssay in dimportance.keys():
        pdesc = prout + "Importance" + str(ML) + "_" + typeAssay
        lrun = list(dimportance[typeAssay].keys())
        # descriptor list is taken from the first run only
        ldesc = list(dimportance[typeAssay][lrun[0]].keys())
        with open(pdesc, "w") as fdesc:
            fdesc.write("Desc\tRun\tval\n")
            for desc in ldesc:
                for run in lrun:
                    try:
                        val = str(dimportance[typeAssay][run][desc]["x"])
                    except (KeyError, TypeError):
                        # descriptor missing for this run
                        val = "0.0"
                    fdesc.write(desc + "\t" + str(run) + "\t" + val + "\n")
        runExternalSoft.runImportanceDesc(pdesc, nbdesc)
    return 0
def parseCuratedDataset(self, pr_in):
    """Load the curated MIC dataset and the per-organism tables.

    Args:
        pr_in: folder containing ``MIC-curated_mol.csv`` and one
            ``<organism>.csv`` file per organism column.

    Returns:
        1 when ``MIC-curated_mol.csv`` does not exist, None otherwise.

    On success ``self.pMIC`` holds the dataset path and
    ``self.tableorgafull`` maps organism -> chemical ID -> row, each row
    being completed with its ``pMIC`` and ``SMILES`` values.
    """
    p_mic_table = pr_in + "MIC-curated_mol.csv"
    # stop here when the dataset file does not exist
    if not path.exists(p_mic_table):
        return 1
    self.pMIC = p_mic_table

    l_rows = toolbox.tableTolist(p_mic_table)
    # every column except the ID and the SMILES is an organism
    l_organisms = list(l_rows[0].keys())
    for k_meta in ("CMPD_CHEMBLID", "SMILES"):
        l_organisms.remove(k_meta)

    d_by_orga = {}
    for organism in l_organisms:
        d_table = toolbox.loadMatrix(pr_in + organism + ".csv")
        d_by_orga[organism] = d_table
        for d_row in l_rows:
            chem_id = d_row["CMPD_CHEMBLID"]
            d_table[chem_id]["pMIC"] = d_row[organism]
            d_table[chem_id]["SMILES"] = d_row["SMILES"]
    self.tableorgafull = d_by_orga
def formatPubChemTable(pfilin, PRPUBCHEM, prout, update=0):
    """Write the ``ID / SMILES / Active`` table of a PubChem assay file.

    Args:
        pfilin: comma-separated PubChem assay table.
        PRPUBCHEM: folder forwarded to the ``Chem`` constructor.
        prout: output folder; the table is written to ``tableSmi.csv``.
        update: when 0 an existing ``tableSmi.csv`` is reused.

    Returns:
        Path of ``tableSmi.csv``.

    Rows whose SMILES contains "Error" are skipped, as are the
    ``RESULT_IS_ACTIVE_CONCENTRATION`` / ``RESULT_UNIT`` pseudo-rows.
    A chemical with "Log of MaxDyeEquivalency" below -7 is forced to
    "Inactive".
    """
    pfilout = prout + "tableSmi.csv"
    if path.exists(pfilout) and update == 0:
        return pfilout

    # load the input before creating the output, so a failed load does not
    # leave a header-only file that would later be taken for a valid cache
    dchem = toolbox.loadMatrix(pfilin, sep=",")
    with open(pfilout, "w") as filout:
        filout.write("ID\tSMILES\tActive\n")
        for chemID in dchem.keys():
            # table metadata rows, not chemicals: skip them before building
            # a Chem object for them
            if chemID in ("RESULT_IS_ACTIVE_CONCENTRATION", "RESULT_UNIT"):
                continue
            cpubmed = Chem(chemID, PRPUBCHEM)
            SMILES = cpubmed.getSMILE()
            if search("Error", SMILES):
                continue
            # add filter
            if "Log of MaxDyeEquivalency" in dchem[chemID]:
                if float(dchem[chemID]["Log of MaxDyeEquivalency"]) < -7:
                    dchem[chemID]["PUBCHEM_ACTIVITY_OUTCOME"] = "Inactive"
            filout.write(
                "%s\t%s\t%s\n" %
                (chemID, SMILES, dchem[chemID]["PUBCHEM_ACTIVITY_OUTCOME"]))
    return pfilout
def validationPredictor(self, typeCellChannel, pAC50All):
    """Predict every chemical tested on one cell/channel and write a table.

    Args:
        typeCellChannel: column of the AC50 table, ``<cell>_<channel>``.
        pAC50All: path of the AC50 table.

    Returns:
        0

    The output file ``<prout>validation/<typeCellChannel>/<typeCellChannel>``
    has one row per predicted chemical and one column per key of the
    prediction dictionary returned by ``self.predictSMI``.
    """
    dAC50All = toolbox.loadMatrix(pAC50All)
    dCASact = {typeCellChannel: []}
    dpredict = {}
    for CASID in dAC50All.keys():
        # have to change
        # NOTE(review): the prediction is assumed to be nested under the
        # non-NA test (source indentation was lost) - confirm.
        if dAC50All[CASID][typeCellChannel] == "NA":
            continue
        dCASact[typeCellChannel].append(CASID)
        if CASID in dpredict:
            continue
        psmi = self.cDB.prSMIclean + CASID + ".smi"
        if not path.exists(psmi):
            continue
        smiles = toolbox.loadSMILES(psmi)
        dpredict[CASID] = self.predictSMI(CASID, smiles, plot=1)

    prval = pathFolder.createFolder(self.prout + "validation/" +
                                    typeCellChannel + "/")
    for typeAssay in dCASact.keys():
        channel = "_".join(typeAssay.split("_")[1:])
        cell = typeAssay.split("_")[0]
        kpred = str(cell) + "_" + str(channel)

        lCAS = list(dpredict.keys())
        # column names come from the first prediction; none when nothing
        # could be predicted (the original raised IndexError here)
        ldesc = list(dpredict[lCAS[0]][kpred]) if lCAS else []

        with open(prval + typeCellChannel, "w") as filout:
            # header: the original joined the characters of the assay name
            # instead of the column names
            filout.write("\t".join(["CASID"] + [str(d) for d in ldesc]) +
                         "\n")
            for CASID in lCAS:
                filout.write(CASID)
                for desc in ldesc:
                    filout.write("\t" + str(dpredict[CASID][kpred][desc]))
                filout.write("\n")
    return 0
def loadAllOperaDesc(pOperaDesc):
    """Load an OPERA descriptor table and re-key it by CAS number.

    Args:
        pOperaDesc: comma-separated table whose rows carry a ``CASRN``
            column and the descriptors listed in ``chemical.LOPERA``.

    Returns:
        dict CASRN -> {descriptor: value}, with "NaN" replaced by "NA".
    """
    d_by_dtx = toolbox.loadMatrix(pOperaDesc, ',')
    d_by_cas = {}
    for dtx_id in d_by_dtx.keys():
        d_row = d_by_dtx[dtx_id]
        d_opera = {}
        d_by_cas[d_row["CASRN"]] = d_opera
        for desc in chemical.LOPERA:
            # normalise the missing-value marker in the loaded row too
            if d_row[desc] == "NaN":
                d_row[desc] = "NA"
            d_opera[desc] = d_row[desc]
    return d_by_cas
def get_PNGAndSMI(p_desc, pr_results):
    """Write one .smi file and one .png picture per chemical of a table.

    Args:
        p_desc: table loaded with ``toolbox.loadMatrix``; every row must
            have a ``SMILES`` entry.
        pr_results: folder in which the ``SMI/`` and ``PNG/`` sub-folders
            are created.
    """
    # the table is loaded to get the cleaned SMILES
    d_chem = toolbox.loadMatrix(p_desc)
    pr_smi_out = pathFolder.createFolder(pr_results + "SMI/")
    pr_png_out = pathFolder.createFolder(pr_results + "PNG/")

    for chem_id in d_chem.keys():
        print(chem_id)
        smiles = d_chem[chem_id]["SMILES"]
        p_smi = pr_smi_out + chem_id + ".smi"
        with open(p_smi, "w") as f_smi:
            f_smi.write(smiles)
        # picture generated from the SMILES file by the external converter
        runExternalSoft.molconvert(p_smi, pr_png_out + chem_id + ".png")
    return
def loadData(self):
    """Load the MIC table and the cluster table onto the instance.

    Reads ``self.p_MIC`` (default separator) into ``self.d_MIC`` and
    ``self.p_cluster`` (comma-separated) into ``self.d_cluster``.
    """
    d_mic_table = toolbox.loadMatrix(self.p_MIC)
    self.d_MIC = d_mic_table
    d_cluster_table = toolbox.loadMatrix(self.p_cluster, sep=",")
    self.d_cluster = d_cluster_table
def mergeResults(prin, prout):
    """Average the performance tables of every run, per cell folder.

    Args:
        prin: folder holding one sub-folder per run, each containing one
            sub-folder per cell with ``perfCV.csv``, ``perfTrain.csv`` and
            ``perfTest.csv``.
        prout: prefix of the ``<cell>.csv`` summary files.

    Returns:
        0

    For every cell, data set (CV / train / test), ML and criterion
    (Acc, Sp, Se, MCC) the mean and standard deviation over the runs are
    written, rounded to 3 decimals. A cell folder whose three tables cannot
    all be loaded is skipped for that run.
    """
    lperfcriteria = ["Acc", "Sp", "Se", "MCC"]
    lset = ["CV", "train", "test"]

    dresult = {}
    for prrun in listdir(prin):
        if prrun == "Average" or prrun == "descImportance":
            continue
        for prcell in listdir(prin + "/" + prrun + "/"):
            prperf = prin + "/" + prrun + "/" + prcell + "/"
            try:
                MCV = toolbox.loadMatrix(prperf + "perfCV.csv", sep=",")
                Mtrain = toolbox.loadMatrix(prperf + "perfTrain.csv", sep=",")
                Mtest = toolbox.loadMatrix(prperf + "perfTest.csv", sep=",")
            except Exception:
                # incomplete run: ignore it (interrupts are no longer
                # swallowed as with the former bare except)
                continue

            # create the structures
            if prcell not in dresult:
                dresult[prcell] = {}
                for kset in lset:
                    dresult[prcell][kset] = {}

            dloaded = {"CV": MCV, "train": Mtrain, "test": Mtest}
            for ML in MCV.keys():
                for kset in lset:
                    # an ML first seen in a later run gets its own lists
                    if ML not in dresult[prcell][kset]:
                        dresult[prcell][kset][ML] = dict(
                            (c, []) for c in lperfcriteria)
                    for criteria in lperfcriteria:
                        dresult[prcell][kset][ML][criteria].append(
                            float(dloaded[kset][ML][criteria]))

    # mean and standard deviation over the runs
    dout = deepcopy(dresult)
    for celltype in dresult.keys():
        for kset in dresult[celltype].keys():
            for ML in dresult[celltype][kset].keys():
                for criteria in dresult[celltype][kset][ML].keys():
                    lval = dresult[celltype][kset][ML][criteria]
                    dout[celltype][kset][ML][criteria] = [
                        round(mean(lval), 3), round(std(lval), 3)]

    # write result
    for celltype in dout.keys():
        pfilout = prout + celltype + ".csv"
        with open(pfilout, "w") as filout:
            for kset in dout[celltype].keys():
                filout.write(str(kset) + "\n")
                filout.write("\t" + "\t".join(
                    ["M-" + str(c) + "\t" + "SD-" + str(c)
                     for c in lperfcriteria]) + "\n")
                for ML in dout[celltype][kset].keys():
                    filout.write(ML)
                    for criteria in lperfcriteria:
                        filout.write(
                            "\t" + str(dout[celltype][kset][ML][criteria][0]) +
                            "\t" + str(dout[celltype][kset][ML][criteria][1]))
                    filout.write("\n")
    return 0
def computeMatrixMCS(self, kID="CMPD_CHEMBLID", kSMILES="CANONICAL_SMILES"):
    """Build (or reload) the pairwise MCS Tanimoto and max-atom matrices.

    Args:
        kID: key of the compound identifier in each ``self.sdata`` entry.
        kSMILES: key of the SMILES in each ``self.sdata`` entry.

    The matrices are stored in ``<prout>tanimoto`` and ``<prout>maxAtom``
    and exposed as ``self.MCSTanimoto`` / ``self.MCSMax`` (as loaded by
    ``toolbox.loadMatrix``). The affinity file ``<prout>aff`` is written
    when missing, ``self.Aff`` is set when missing, and the MDS plot is
    delegated to ``runExternalSoft.MDSMCS``.
    """
    pfiloutTanimoto = self.prout + "tanimoto"
    pfiloutNBatomMax = self.prout + "maxAtom"

    def _pairValue(dmat, ID1, ID2):
        # only the upper triangle is computed: fall back on the mirror cell
        try:
            return dmat[ID1][ID2]
        except KeyError:
            return dmat[ID2][ID1]

    if path.exists(pfiloutTanimoto) and path.exists(pfiloutNBatomMax):
        print("in")
    else:
        lcmpdID = [compound[kID] for compound in self.sdata]
        imax = len(self.sdata)
        dTanimoto = {}
        dMaxMCS = {}
        for i in range(imax):
            IDi = self.sdata[i][kID]
            for j in range(i, imax):
                print("%s %s" % (i, j))
                if IDi not in dTanimoto:
                    dTanimoto[IDi] = {}
                    dMaxMCS[IDi] = {}
                IDj = self.sdata[j][kID]
                # a pair already present (duplicated ID) is not recomputed
                if IDj not in dTanimoto[IDi]:
                    ltanimoto_max = get_Tanimoto(self.sdata[i][kSMILES],
                                                 self.sdata[j][kSMILES])
                    dMaxMCS[IDi][IDj] = ltanimoto_max[1]
                    dTanimoto[IDi][IDj] = ltanimoto_max[0]

        with open(pfiloutTanimoto, "w") as filoutTanimoto, \
                open(pfiloutNBatomMax, "w") as filoutNBatomMax:
            filoutTanimoto.write("\t".join(lcmpdID) + "\n")
            filoutNBatomMax.write("\t".join(lcmpdID) + "\n")
            for cmpdID1 in lcmpdID:
                lwTanimoto = []
                lwMax = []
                for cmpdID2 in lcmpdID:
                    lwTanimoto.append(
                        str(_pairValue(dTanimoto, cmpdID1, cmpdID2)))
                    # NOTE(review): element [1] of the second value returned
                    # by get_Tanimoto is written, as in the original code -
                    # confirm that this value is a sequence.
                    lwMax.append(
                        str(_pairValue(dMaxMCS, cmpdID1, cmpdID2)[1]))
                filoutTanimoto.write(cmpdID1 + "\t" + "\t".join(lwTanimoto) +
                                     "\n")
                filoutNBatomMax.write(cmpdID1 + "\t" + "\t".join(lwMax) + "\n")

    # same attributes whether the matrices were cached or just computed
    # (the original only set them in the cached case)
    dMCSTanimoto = toolbox.loadMatrix(pfiloutTanimoto)
    dMCSMax = toolbox.loadMatrix(pfiloutNBatomMax)
    self.MCSTanimoto = dMCSTanimoto
    self.MCSMax = dMCSMax

    paff = self.prout + "aff"
    if not "Aff" in dir(self):
        daff = {}
        for compound in self.sdata:
            daff[compound[kID]] = compound["PCHEMBL_VALUE"]
        self.Aff = daff

    if not path.exists(paff):
        with open(paff, "w") as filoutaff:
            filoutaff.write("pchem affinity\n")
            for compound in self.sdata:
                filoutaff.write(str(compound[kID]) + "\t" +
                                str(compound["PCHEMBL_VALUE"]) + "\n")

    # plot matrix
    runExternalSoft.MDSMCS(pfiloutTanimoto, paff)
def _computeChemDesc(chemID, dinfo, PRDESC, PRSMI, update):
    """Compute and write the descriptors of one chemical; False on failure."""
    if search("error", dinfo["SMILES"].lower()):
        # case of the table is computed before
        return False
    if dinfo["Active"] == "Inconclusive":
        return False

    # compute descriptors; each step reports failures through the log
    cchem = chemical.chemical(chemID, dinfo["SMILES"])
    cchem.prepareChem(PRSMI)
    if search("error", cchem.log.lower()):
        return False
    cchem.compute1D2DDesc(PRDESC)
    if search("error", cchem.log.lower()):
        return False
    cchem.computeOpera(update=update)
    if search("error", cchem.log.lower()):
        return False
    cchem.writeTablesDesc(PRDESC, update=update)
    return True


def _descLine(chemID, ldesc, PRDESC):
    """Return "ID,val1,val2,..." for one chemical, None without desc file."""
    pdesc = PRDESC + chemID + ".txt"
    if not path.exists(pdesc):
        return None
    ddesc = toolbox.loadMatrix(pdesc)
    lval = []
    for desc in ldesc:
        if desc in ddesc[chemID]:
            lval.append(str(ddesc[chemID][desc]))
        else:
            lval.append("NA")
    return chemID + "," + ",".join(lval)


def computeDesc(passay, PRDESC, PRSMI, prout, nbfile=1, update=0):
    """Compute the descriptors of an assay table and write the matrix.

    Args:
        passay: assay table with ``SMILES`` and ``Active`` columns.
        PRDESC: folder of the per-chemical descriptor files.
        PRSMI: folder forwarded to ``prepareChem``.
        prout: output folder (``descMat`` and, for two files, ``aff.txt``).
        nbfile: 1 -> single file with a trailing ``Aff`` column;
            otherwise descriptors and affinities go to two files.
        update: when 0 existing outputs are reused.

    Returns:
        ``pdescout`` when ``nbfile == 1``, else ``[pdescout, paff]``.

    Chemicals with an erroneous SMILES, an "Inconclusive" outcome or a
    failed descriptor computation are dropped. ``Aff`` is 1 for "Active"
    chemicals and 0 otherwise.
    """
    pdescout = prout + "descMat"
    paff = prout + "aff.txt"

    # by pass
    if path.exists(pdescout) and update == 0:
        if nbfile == 1:
            return pdescout
        if nbfile == 2 and path.exists(paff):
            return [pdescout, paff]

    dchem = toolbox.loadMatrix(passay)
    lchemID = list(dchem.keys())
    try:
        lchemID.remove("RESULT_UNIT")
    except ValueError:
        pass
    shuffle(lchemID)

    # keep the chemicals whose descriptors could be computed; each one is
    # visited exactly once (the original index loop stepped back after an
    # OPERA failure and re-processed a neighbour)
    lchemID = [
        chemID for chemID in lchemID
        if _computeChemDesc(chemID, dchem[chemID], PRDESC, PRSMI, update)
    ]

    ldesc = chemical.getLdesc("1D2D", 1) + chemical.getLdesc("Opera", 0)

    if nbfile == 1:
        with open(pdescout, "w") as fildesc:
            fildesc.write("ID," + ",".join(ldesc) + ",Aff" + "\n")
            for chemID in lchemID:
                print(chemID)
                aff = 1 if dchem[chemID]["Active"] == "Active" else 0
                line = _descLine(chemID, ldesc, PRDESC)
                if line is not None:
                    fildesc.write(line + "," + str(aff) + "\n")
        return pdescout

    with open(pdescout, "w") as fildesc, open(paff, "w") as filaff:
        fildesc.write("ID," + ",".join(ldesc) + "\n")
        filaff.write("ID\tAff\n")
        for chemID in lchemID:
            print(chemID)
            aff = 1 if dchem[chemID]["Active"] == "Active" else 0
            line = _descLine(chemID, ldesc, PRDESC)
            if line is not None:
                # NOTE(review): rows carry a trailing Aff value that the
                # header does not declare; kept as in the original - confirm
                # what the reader of descMat expects.
                fildesc.write(line + "," + str(aff) + "\n")
                filaff.write(chemID + "\t" + str(aff) + "\n")
    return [pdescout, paff]
def _writeMeanProba(pfilout, dpred, drealcell):
    """Write ID, mean and SD of the predictions, and the real class."""
    with open(pfilout, "w") as filout:
        filout.write("ID\tMpred\tSDpred\tReal\n")
        for chemID in dpred.keys():
            filout.write("%s\t%.3f\t%.3f\t%s\n" %
                         (chemID, mean(dpred[chemID]), std(dpred[chemID]),
                          drealcell[chemID]["Aff"]))


def mergeProba(prin, ML, prout):
    """Merge the predicted probabilities of every run, per cell folder.

    Args:
        prin: folder holding one sub-folder per run (expected to end with
            a path separator, as paths are built by concatenation).
        ML: machine-learning name used in the ``<ML>class/`` folder name.
        prout: prefix of the ``<cell>_train``, ``<cell>_test`` and
            ``<cell>_CV`` output tables.

    Returns:
        0

    Each output table lists, per chemical, the mean and standard deviation
    of the predictions over the runs and the real class read from the
    ``AC50_*`` file; it is then plotted by
    ``runExternalSoft.plotAC50VSProb``. A cell folder without ``AC50_*``
    file is skipped for that run.
    """
    dprob = {}
    dreal = {}
    for prrun in listdir(prin):
        if prrun in ("Average", "descImportance", "Prob"):
            continue
        for prcell in listdir(prin + "/" + prrun + "/"):
            prcellrun = prin + prrun + "/" + prcell + "/"

            # first file whose name contains "AC50_"
            paff = None
            for filin in listdir(prcellrun):
                if search("AC50_", filin):
                    paff = prcellrun + filin
                    break
            if paff is None:
                # the original reused the path of the previous folder here
                continue

            if prcell not in dreal:
                dreal[prcell] = {}
            daff = toolbox.loadMatrix(paff, sep="\t")
            dreal[prcell].update(deepcopy(daff))

            if prcell not in dprob:
                dprob[prcell] = {}
            dprob[prcell][prrun] = {}

            prclass = prcellrun + str(ML) + "class/"
            # NOTE(review): the CV file name is fixed to "PerfRFClassCV10"
            # whatever the value of ML - confirm.
            dprob[prcell][prrun]["CV"] = toolbox.loadMatrix(
                prclass + "PerfRFClassCV10.txt")
            dprob[prcell][prrun]["train"] = toolbox.loadMatrix(
                prclass + "classTrain.csv", sep=",")
            dprob[prcell][prrun]["test"] = toolbox.loadMatrix(
                prclass + "classTest.csv", sep=",")

            print(list(dreal[prcell].keys()))
            print(len(dreal[prcell].keys()))

    # write table for probability
    lsetcol = (("train", "x"), ("test", "x"), ("CV", "Predict"))
    dw = {}
    for prcell in dprob.keys():
        dw[prcell] = {"train": {}, "test": {}, "CV": {}}
        for run in dprob[prcell].keys():
            for kset, kval in lsetcol:
                dset = dprob[prcell][run][kset]
                for chemID in dset.keys():
                    dw[prcell][kset].setdefault(chemID, []).append(
                        float(dset[chemID][kval]))

    for prcell in dw.keys():
        for kset in ("train", "test", "CV"):
            pfilout = prout + prcell + "_" + kset
            _writeMeanProba(pfilout, dw[prcell][kset], dreal[prcell])
            runExternalSoft.plotAC50VSProb(pfilout, prout)
    return 0
def writeClassActive(self):
    """Write, per assay type, a two-class file ``actClass.txt``.

    For every key of ``self.dpresult`` the AC50 file referenced in
    ``self.dpAC50`` is turned into a list of "ID<TAB>1" (active) and
    "ID<TAB>0" (inactive) lines, and ``self.dpAC50`` is repointed to the
    new class file. An existing class file larger than 10 bytes is reused.

    NOTE(review): the source indentation was lost; the nesting below is the
    most plausible reading and should be confirmed against the repository.
    """
    from random import shuffle
    print self.dpresult
    print self.pAC50All
    dAC50All = toolbox.loadMatrix(self.pAC50All, sep="\t")
    for typeAC50 in self.dpresult:
        pclass = self.dpresult[typeAC50] + "actClass.txt"
        if path.exists(pclass) and path.getsize(pclass) > 10:
            # class file already there: only register it
            self.dpAC50[typeAC50] = pclass
        else:
            filin = open(self.dpAC50[typeAC50], "r")
            llines = filin.readlines()
            filin.close()
            filout = open(pclass, "w")
            # header line copied as is
            filout.write(llines[0])
            # shuffle lines
            llines = llines[1:]
            shuffle(llines)
            # number of actives = lines whose last column is not "NA"
            nbact = 0
            for lineChem in llines:
                AC50 = lineChem.strip().split("\t")[-1]
                if AC50 != "NA":
                    nbact = nbact + 1
            # number of inactives needed to reach the ratio self.ratioAct
            nbinact = int(100 * nbact / (100 * self.ratioAct)) - nbact
            # select active chemical
            # NOTE(review): the header was already removed above, so
            # llines[1:] skips one (shuffled) chemical - confirm intent.
            llineAct = []
            for lineChem in llines[1:]:
                lAC50 = lineChem.strip().split("\t")
                lnew = [lAC50[0]]
                for AC50 in lAC50[1:]:
                    if AC50 != "NA":
                        lnew.append("1")
                        llineAct.append("\t".join(lnew))
            # select inactive but select active for other channel
            # NOTE(review): llineInact is only defined in this branch; with
            # typeAC50 == "Luc_IC50" the code below would fail - confirm.
            if typeAC50 != "Luc_IC50":
                # add channel active in the set
                llineInact = []
                for CASID in dAC50All.keys():
                    if dAC50All[CASID][self.cell + "_" + typeAC50] != "NA":
                        continue
                    else:
                        for channel in dAC50All[CASID].keys():
                            if not search("Luc_IC50", channel):
                                if channel != "CASID":
                                    if dAC50All[CASID][channel] != "NA":
                                        lnew = [CASID, "0"]
                                        llineInact.append("\t".join(lnew))
                                        break
            # random active
            nbinactselected = len(llineInact)
            print nbinact, nbinactselected
            if nbinactselected >= nbinact:
                # enough inactives: keep a random subset of them
                shuffle(llineInact)
                llineInact = llineInact[:nbinact]
                lw = llineAct + llineInact
                shuffle(lw)
            else:
                # not enough: complete with chemicals that are NA in the
                # AC50 file of this assay
                nwinact = nbinactselected
                # first loop to take inactive
                for lineChem in llines[1:]:
                    lAC50 = lineChem.strip().split("\t")
                    lnew = [lAC50[0]]
                    for AC50 in lAC50[1:]:
                        if AC50 == "NA":
                            lnew.append("0")
                            lneww = "\t".join(lnew)
                            if not lneww in llineInact:
                                llineInact.append(lneww)
                                nwinact += 1
                                break
                    if nwinact >= nbinact:
                        break
                lw = llineAct + llineInact
                shuffle(lw)
            filout.write("\n".join(lw))
            filout.close()
            self.dpAC50[typeAC50] = pclass
def prepDataColor(self):
    """Prepare the class file and the train/test sets for one color.

    The class file ``<prresult><cell>/AC50_<cell>`` flags a chemical as
    active (1) when at least one channel whose name matches
    ``<cell>_n`` has a non-NA AC50; inactives (0) are sampled so that the
    proportion of actives follows ``self.ratioAct``. The train/test split
    is delegated to ``runExternalSoft.prepDataQSAR``.

    Sets ``self.dptrain``, ``self.dptest``, ``self.dpAC50`` and
    ``self.dpresult`` (dictionaries keyed by ``self.cell``).

    Returns:
        0 when existing files are reused, None after a fresh preparation.
    """
    # format by type of AC50
    # change self with one folder by type of AC50
    presult = pathFolder.createFolder(self.prresult + self.cell + "/")
    pClass = presult + "AC50_" + str(self.cell)
    ptrain = presult + "trainSet.csv"
    ptest = presult + "testSet.csv"

    def _register():
        # expose the paths on the instance, keyed by cell
        self.dptrain = {self.cell: ptrain}
        self.dptest = {self.cell: ptest}
        self.dpAC50 = {self.cell: pClass}
        self.dpresult = {self.cell: presult}

    # by pass
    if path.exists(ptrain) and path.exists(ptest) and path.exists(pClass):
        _register()
        return 0

    from random import shuffle
    color = self.cell + "_n"
    dAC50 = toolbox.loadMatrix(self.pAC50All, sep="\t")

    # remove the "" key explicitly; the original dropped the first key of
    # the dictionary, which only works while "" happens to come first
    lCASID = [CASID for CASID in dAC50.keys() if CASID != ""]
    shuffle(lCASID)

    lact = []
    linact = []
    for CASID in lCASID:
        active = any(
            search(color, channel) and dAC50[CASID][channel] != "NA"
            for channel in dAC50[CASID].keys())
        if active:
            lact.append(str(CASID) + "\t1")
        else:
            linact.append(str(CASID) + "\t0")

    # number of inactives needed to reach the requested ratio of actives;
    # never negative, so the slice below cannot cut from the end
    nbinact = max(
        0, int(100 * len(lact) / (100 * self.ratioAct)) - len(lact))
    lw = lact + linact[:nbinact]
    shuffle(lw)

    with open(pClass, "w") as fclass:
        fclass.write("CAS\tAff\n")
        fclass.write("\n".join(lw))

    runExternalSoft.prepDataQSAR(self.pdesc, pClass, presult, self.corval,
                                 self.maxQauntile, self.splitRatio,
                                 self.nbNA, "0")
    _register()
def prepData(self, typeData):
    """Split the AC50 table by channel and prepare the train/test sets.

    Args:
        typeData: "all" or "active"; selects ``self.writeClass`` or
            ``self.writeClassActive`` when ``self.typeQSAR`` is not "Reg".

    Sets ``self.dpAC50`` (channel -> AC50 file), ``self.dpresult``
    (channel -> result folder), ``self.dptrain`` and ``self.dptest``.

    NOTE(review): the source indentation was lost; ``ptrain``/``ptest`` are
    assumed to be computed for both QSAR types - confirm.
    """
    # format by type of AC50
    # change self with one folder by type of AC50
    dAC50 = toolbox.loadMatrix(self.pAC50, sep="\t")
    dfileAC50 = {}
    dprresult = {}
    imax = len(self.lchannel)
    i = 0
    # one result folder and one open "AC50_<channel>" file per channel
    while i < imax:
        AC50type = self.lchannel[i]
        presult = pathFolder.createFolder(self.prresult + AC50type + "/")
        dprresult[AC50type] = presult
        dfileAC50[AC50type] = open(presult + "AC50_" + str(AC50type), "w")
        dfileAC50[AC50type].write("CAS\tAff\n")
        i += 1
    for CAS in dAC50.keys():
        for channel in self.lchannel:
            dfileAC50[channel].write(
                str(CAS) + "\t" + str(dAC50[CAS][channel]) + "\n")
    for typeAC50 in self.lchannel:
        dfileAC50[typeAC50].close()
        # keep the path of the file instead of the closed handle
        dfileAC50[typeAC50] = dfileAC50[typeAC50].name
    self.dpAC50 = dfileAC50
    self.dpresult = dprresult
    dtrain = {}
    dtest = {}
    for typeAC50 in self.dpAC50.keys():
        if self.typeQSAR == "Reg":
            runExternalSoft.prepDataQSAR(self.pdesc, self.dpAC50[typeAC50],
                                         self.dpresult[typeAC50], self.corval,
                                         self.maxQauntile, self.splitRatio,
                                         self.nbNA)
        else:
            if typeData == "all":
                self.writeClass()
            elif typeData == "active":
                self.writeClassActive()
        ptrain = self.dpresult[typeAC50] + "trainSet.csv"
        ptest = self.dpresult[typeAC50] + "testSet.csv"
        print ptrain
        print ptest
        # the split is run only when neither of the two sets exists
        if not path.exists(ptrain) and not path.exists(ptest):
            runExternalSoft.prepDataQSAR(self.pdesc, self.dpAC50[typeAC50],
                                         self.dpresult[typeAC50], self.corval,
                                         self.maxQauntile, self.splitRatio,
                                         self.nbNA)
        dtrain[typeAC50] = ptrain
        dtest[typeAC50] = ptest
    self.dptrain = dtrain
    self.dptest = dtest
def predictSMI(self, nameChemical, smiles, plot=0, verbose=0):
    """Predict the cluster enrichment of one chemical for every model.

    Args:
        nameChemical: name used for the result sub-folder of ``self.prout``.
        smiles: SMILES passed to ``chemical.chemical``.
        plot: when 1, ``self.writeResultBySMI`` is called with the result.
        verbose: when 1, intermediate values are printed.

    Returns:
        dict ``"<cell>_<channel>"`` -> {typeDesc: enrichment}. When the file
        ``<result folder>/pred`` exists it is loaded and returned as is.
    """
    dpred = {}
    prresult = pathFolder.createFolder(self.prout + nameChemical + "/")
    ppred = prresult + "pred"
    if path.exists(ppred):
        dpred = toolbox.loadMatrix(ppred)
        return dpred
    # descriptors and fingerprints of the query chemical
    chem = chemical.chemical(nameChemical, smiles)
    chem.prepareChem(prresult)
    chem.compute1D2DDesc(prresult)
    chem.writeTablesDescCAS(prresult)
    chem.computeFP(typeFP="All")
    for channel in self.dcluster:
        for cell in self.dcluster[channel].keys():
            kpred = str(cell) + "_" + str(channel)
            dpred[kpred] = {}
            for typeDesc in self.dcluster[channel][cell].keys():
                if verbose == 1:
                    print channel, cell, typeDesc
                    print self.dcluster[channel][cell].keys()
                if search("Desc", typeDesc):
                    # descriptor based clustering: the cluster is found by
                    # the external tool; typeDesc is split on "-" into
                    # distance and aggregation methods
                    distMeth = typeDesc.split("-")[1]
                    aggMeth = typeDesc.split("-")[2]
                    enrichment = runExternalSoft.findCluster(
                        self.cDB.pdesc1D2Dclean, chem.pdesc,
                        self.dcluster[channel][cell][typeDesc]["files"][0],
                        self.dcluster[channel][cell][typeDesc]["files"][1],
                        distMeth, aggMeth)
                else:
                    # generate FP
                    typeFP = typeDesc.split("-")[0]
                    metric = typeDesc.split("-")[-1].split("_")[0]
                    metricAgg = typeDesc.split("-")[-1]
                    if verbose == 1:
                        print typeFP, metric
                    # similarity of the query to every chemical of the DB
                    dFP = {}
                    for CASID in self.cDB.dFP.keys():
                        if verbose == 1:
                            print self.cDB.dFP[CASID]
                            print chem.FP[typeFP]
                            print metric
                        dFP[CASID] = float(
                            toolbox.computeSimilarityFP(
                                self.cDB.dFP[CASID][typeFP], chem.FP[typeFP],
                                metric))
                    maxSim = max(dFP.values())
                    i = 0
                    imax = len(dFP.keys())
                    lCAS = dFP.keys()
                    # keep the last chemical reaching the maximum similarity
                    # NOTE(review): float() wraps the comparison result, not
                    # the value; it still behaves as the boolean test.
                    while i < imax:
                        if float(dFP[lCAS[i]] == maxSim):
                            CASclose = lCAS[i]
                        i += 1
                    if verbose == 1:
                        print CASclose
                        print channel, cell
                        print self.ChemClust[CASclose][channel][cell]
                    # the query inherits the cluster of its closest chemical
                    clusterfound = self.ChemClust[CASclose][channel][cell][
                        str(typeFP) + "-" + str(metricAgg)]
                    enrichment = self.dcluster[channel][cell][typeDesc][
                        clusterfound]['Enrichment']
                dpred[kpred][typeDesc] = enrichment
    if plot == 1:
        self.writeResultBySMI(dpred, prresult)
    return dpred
def __init__(self, pDYE, lcAssays, prout):
    """Store the inputs and load the dye table.

    Args:
        pDYE: path of the comma-separated dye table.
        lcAssays: assays, kept as ``self.lassays``.
        prout: output folder, kept as ``self.prout``.
    """
    self.prout, self.pDYE, self.lassays = prout, pDYE, lcAssays
    d_dye_table = toolbox.loadMatrix(self.pDYE, sep=",")
    self.dDye = d_dye_table
def prepareActiveMatrix(self, corval, maxQuantile, NBNA, pAC50All, prout,
                        luciferase=0):
    """Restrict descriptors and AC50 tables to active chemicals and clean.

    Args:
        corval, maxQuantile, NBNA: forwarded to
            ``runExternalSoft.dataManager``.
        pAC50All: path of the AC50 table.
        prout: output folder (``descActive`` and ``AC50Active``).
        luciferase: 0 -> drop chemicals that are NA on every assay other
            than ``Luc_IC50``; otherwise keep only the ``Luc_IC50`` (and
            ``CASID``) columns of chemicals that have descriptors and a
            non-NA value.

    Returns:
        ``[self.pdescCleanActive, self.pAC50AllActive]``.

    Existing ``descActive`` / ``AC50Active`` files larger than 10 bytes are
    reused without being rebuilt.
    """
    self.corval = corval
    self.maxQuantile = maxQuantile
    pdescAct = prout + "descActive"
    pAC50Act = prout + "AC50Active"

    cached = (path.exists(pdescAct) and path.getsize(pdescAct) > 10 and
              path.exists(pAC50Act) and path.getsize(pAC50Act) > 10)
    if not cached:
        ddesc = toolbox.loadMatrix(self.pdesc1D2D)
        dAC50All = toolbox.loadMatrix(pAC50All)
        lkeep = ("CASID", "Luc_IC50")

        # iterate over a snapshot of the keys: entries are deleted on the fly
        # (the original index loop was bounded by the descriptor table and
        # could skip or revisit chemicals)
        for casID in list(dAC50All.keys()):
            dAC50 = dAC50All[casID]
            if luciferase == 0:
                # not considered luciferase
                inactive = all(dAC50[kAC50] == "NA"
                               for kAC50 in dAC50.keys()
                               if kAC50 not in lkeep)
                if inactive:
                    del dAC50All[casID]
                    ddesc.pop(casID, None)
            else:
                if casID not in ddesc:
                    del dAC50All[casID]
                    continue
                if any(dAC50[kAC50] == "NA"
                       for kAC50 in lkeep if kAC50 in dAC50):
                    del dAC50All[casID]
                    ddesc.pop(casID, None)
                    continue
                # keep only the luciferase (and ID) columns
                for kAC50 in list(dAC50.keys()):
                    if kAC50 not in lkeep:
                        del dAC50[kAC50]

        toolbox.writeMatrix(ddesc, pdescAct)
        toolbox.writeMatrix(dAC50All, pAC50Act)

    lpdescActClean = runExternalSoft.dataManager(pdescAct, pAC50Act, corval,
                                                 maxQuantile, NBNA, prout)
    self.pdescCleanActive = lpdescActClean[0]
    self.pAC50AllActive = lpdescActClean[1]
    return [self.pdescCleanActive, self.pAC50AllActive]
def extractActivebySOM(self, prin=""):
    """Copy the picture of every active chemical into its SOM cluster folder.

    Args:
        prin: folder with one sub-folder per assay containing a ``SOMClust``
            file; defaults to ``self.prSOMactive``.

    For each line of ``SOMClust`` (first field = CAS, last field = cluster)
    the PNG of the chemical is copied from ``self.prPNG`` to
    ``<prin><assay>/<cluster>/`` when the chemical has a non-NA AC50 for the
    assay, or for one of the assays derived from the folder name.
    """
    if prin == "":
        prin = self.prSOMactive
    lfolders = listdir(prin)
    dAC50all = toolbox.loadMatrix(self.pAC50AllActive, sep=",")
    for assay in lfolders:
        pclust = prin + assay + "/SOMClust"
        if not path.exists(pclust):
            continue
        fclust = open(pclust, "r")
        lchemicals = fclust.readlines()
        fclust.close()
        # first line is skipped (header)
        for lineChem in lchemicals[1:]:
            lchemClust = lineChem.strip().replace("\"", "").split(",")
            CAS = lchemClust[0]
            clust = lchemClust[-1]
            if CAS == "NA":
                continue
            #print CAS, clust
            if assay in dAC50all[CAS].keys():
                # folder name is directly a column of the AC50 table
                if dAC50all[CAS][assay] != "NA":
                    pclust = pathFolder.createFolder(prin + assay + "/" +
                                                     str(clust) + "/")
                    copyfile(self.prPNG + CAS + ".png", pclust + CAS + ".png")
                    continue
            elif assay == "red" or assay == "green" or assay == "blue" or assay == "allcolors":
                # color folder: active on any cell/medium assay of the color
                lassays = [
                    "hepg2_cell_X_n", "hepg2_med_X_n", "hek293_med_X_n",
                    "hek293_cell_X_n"
                ]
                if assay == "allcolors":
                    lassayout = []
                    lassayout = lassayout + [
                        i.replace("X", "blue") for i in lassays
                    ] + [i.replace("X", "green") for i in lassays
                         ] + [i.replace("X", "red") for i in lassays]
                    lassays = lassayout
                else:
                    lassays = [i.replace("X", assay) for i in lassays]
                for ass in lassays:
                    if dAC50all[CAS][ass] != "NA":
                        pclust = pathFolder.createFolder(prin + assay + "/" +
                                                         str(clust) + "/")
                        copyfile(self.prPNG + CAS + ".png",
                                 pclust + CAS + ".png")
                        break
            elif search("hepg2", assay) or search("hek293", assay):
                # "<cell line>_<color>" folder: cell or medium assay
                lassays = ["X_cell_Y_n", "X_med_Y_n"]
                lass = assay.split("_")
                lassays = [
                    i.replace("X", lass[0]).replace("Y", lass[1])
                    for i in lassays
                ]
                for ass in lassays:
                    if dAC50all[CAS][ass] != "NA":
                        pclust = pathFolder.createFolder(prin + assay + "/" +
                                                         str(clust) + "/")
                        copyfile(self.prPNG + CAS + ".png",
                                 pclust + CAS + ".png")
                        break
def rank_chem(p_dataset, p_cluster, pr_PNG, pr_out):
    """Build a PDF of chemical pictures grouped by cluster with pMIC ranks.

    Args:
        p_dataset: MIC table (chemical -> organism -> MIC, plus ``SMILES``
            and ``CMPD_CHEMBLID`` columns).
        p_cluster: comma-separated table with a ``cluster`` column.
        pr_PNG: folder of the ``<chemical>.png`` pictures.
        pr_out: output folder for ``page_<n>.png`` and ``chem_pMIC.pdf``.

    Each page holds up to six chemicals (two rows of three) with five text
    lines each: name and cluster, then pMIC and rank for four organisms.
    ``font`` is not defined in this function - presumably a module-level
    object; verify.
    """
    # load files
    d_MIC = toolbox.loadMatrix(p_dataset)
    d_cluster = toolbox.loadMatrix(p_cluster, sep=",")
    # rank pMIC
    d_rank = {}
    for chem in d_MIC.keys():
        for orga in d_MIC[chem].keys():
            if orga == "SMILES" or orga == 'CMPD_CHEMBLID':
                continue
            if not orga in d_rank.keys():
                d_rank[orga] = []
            # the table value is replaced in place by -log10(value)
            pMIC = -log10(float(d_MIC[chem][orga]))
            d_MIC[chem][orga] = pMIC
            d_rank[orga].append(pMIC)
    # order pMIC
    for orga in d_rank.keys():
        d_rank[orga].sort(reverse=True)
    # reorganise d_cluster
    d_cluster_used = {}
    for chem in d_cluster.keys():
        cluster = int(d_cluster[chem]["cluster"])
        if not cluster in d_cluster_used.keys():
            d_cluster_used[cluster] = []
        d_cluster_used[cluster].append(chem)
    # build the png
    lcluster = d_cluster_used.keys()
    lcluster.sort()
    i_page = 1
    l_pngout = []
    for cluster in lcluster:
        nchem = len(d_cluster_used[cluster])
        l_pchempng = []
        lw = []
        i_chem = 0
        nchem_page = 0
        # one extra iteration (i_chem == nchem) flushes the last page
        while i_chem <= nchem:
            if nchem_page == 6 or i_chem == nchem:
                # page full or cluster finished: draw and save the page
                p_image = pr_out + "page_" + str(i_page) + ".png"
                l_pngout.append(p_image)
                imgnew = Image.new("RGBA", (1535, 1285), (250, 250, 250))
                i_image = 0
                i_img_max = len(l_pchempng)
                while i_image < i_img_max:
                    # put png
                    img1 = Image.open(l_pchempng[i_image])
                    if i_image < 3:
                        # first row: pictures 0-2, text lines 0-14 of lw
                        imgnew.paste(img1, (0 + i_image * 510, 0))
                        draw = ImageDraw.Draw(imgnew)
                        draw.text((5 + 510 * i_image, 500),
                                  lw[0 + (i_image * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * i_image, 525),
                                  lw[1 + (i_image * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * i_image, 550),
                                  lw[2 + (i_image * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * i_image, 575),
                                  lw[3 + (i_image * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * i_image, 600),
                                  lw[4 + (i_image * 5)], (0, 0, 0),
                                  font=font)
                    else:
                        # second row: pictures 3-5, text lines 15-29 of lw
                        imgnew.paste(img1, (0 + (i_image - 3) * 510, 650))
                        draw = ImageDraw.Draw(imgnew)
                        draw.text((5 + 510 * (i_image - 3), 1150),
                                  lw[15 + ((i_image - 3) * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * (i_image - 3), 1175),
                                  lw[16 + ((i_image - 3) * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * (i_image - 3), 1200),
                                  lw[17 + ((i_image - 3) * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * (i_image - 3), 1225),
                                  lw[18 + ((i_image - 3) * 5)], (0, 0, 0),
                                  font=font)
                        draw.text((5 + 510 * (i_image - 3), 1250),
                                  lw[19 + ((i_image - 3) * 5)], (0, 0, 0),
                                  font=font)
                    i_image = i_image + 1
                imgnew.save(p_image)
                # add
                nchem_page = 0
                lw = []
                l_pchempng = []
                i_page = i_page + 1
                if i_chem == nchem:
                    break
                else:
                    continue
            # queue the picture and its five text lines for the current page
            l_pchempng.append(pr_PNG + d_cluster_used[cluster][i_chem] +
                              ".png")
            lw.append("%s (cluster: %s)" % (d_cluster_used[cluster][i_chem],
                                            cluster))
            # rank = 1-based position of the pMIC in the descending list
            lw.append(
                "pMIC (E. coli): %.2f (%s)" %
                (d_MIC[d_cluster_used[cluster][i_chem]]["Escherichia coli"],
                 d_rank["Escherichia coli"].index(d_MIC[
                     d_cluster_used[cluster][i_chem]]["Escherichia coli"]) +
                 1))
            lw.append(
                "pMIC (P. aeruginosa): %.2f (%s)" %
                (d_MIC[d_cluster_used[cluster]
                       [i_chem]]["Pseudomonas aeruginosa"],
                 d_rank["Pseudomonas aeruginosa"].index(d_MIC[d_cluster_used[
                     cluster][i_chem]]["Pseudomonas aeruginosa"]) + 1))
            lw.append(
                "pMIC (S. aureus): %.2f (%s)" %
                (d_MIC[d_cluster_used[cluster]
                       [i_chem]]["Staphylococcus aureus"],
                 d_rank["Staphylococcus aureus"].index(d_MIC[d_cluster_used[
                     cluster][i_chem]]["Staphylococcus aureus"]) + 1))
            lw.append(
                "pMIC (S. pneumoniae): %.2f (%s)" %
                (d_MIC[d_cluster_used[cluster]
                       [i_chem]]["Streptococcus pneumoniae"],
                 d_rank["Streptococcus pneumoniae"].index(d_MIC[d_cluster_used[
                     cluster][i_chem]]["Streptococcus pneumoniae"]) + 1))
            nchem_page = nchem_page + 1
            i_chem = i_chem + 1
    # transform png to pdf
    lpdf = []
    for ppng in l_pngout:
        ppdf = runExternalSoft.pngtopdf(ppng)
        lpdf.append(ppdf)
    # merge pdf sheet
    runExternalSoft.mergepdfs(lpdf, pr_out + "chem_pMIC.pdf")
def applyModel(pdesc, prmodels, prout):
    """Apply every stored model to a descriptor table and summarise results.

    Args:
        pdesc: descriptor table forwarded to
            ``runExternalSoft.predictModel``.
        prmodels: folder organised as ``<type>/<color>/<ML>/<model file>``
            (expected to end with a path separator).
        prout: output folder; the same ``<type>/<color>/<ML>/`` tree is
            created in it.

    For each ML folder the predictions of all its models are averaged per
    chemical into ``sumProb`` (mean, SD, real class), then scored by
    ``runExternalSoft.qualityPred`` and plotted. For each color a
    ``sumPerf.csv`` gathers the quality criteria of every ML. A prediction
    file that cannot be loaded is ignored.
    """
    # inventory of the model files: type -> color -> ML -> [paths]
    dmodel = {}
    for folderModel in listdir(prmodels):
        dmodel[folderModel] = {}
        for model in listdir(prmodels + folderModel):
            dmodel[folderModel][model] = {}
            prmodel = prmodels + folderModel + "/" + model
            for ML in listdir(prmodel):
                prmodelML = prmodel + "/" + ML + "/"
                dmodel[folderModel][model][ML] = [
                    prmodelML + modelR for modelR in listdir(prmodelML)
                ]

    lh = [
        "TP", "TN", "FP", "FN", "acc", "se", "sp", "mcc", "MpbTP", "SDpbTP",
        "MpbTN", "SDpbTN", "MpbFP", "SDpbFP", "MpbFN", "SDpbFN"
    ]

    for typeModel in dmodel.keys():
        pathFolder.createFolder(prout + typeModel + "/")
        for color in dmodel[typeModel].keys():
            prcolor = prout + typeModel + "/" + color + "/"
            pathFolder.createFolder(prcolor)
            dperf = {}
            for ML in dmodel[typeModel][color].keys():
                proutbyRmodel = pathFolder.createFolder(prcolor + ML + "/")
                lpredict = [
                    runExternalSoft.predictModel(pdesc, modelR, ML,
                                                 proutbyRmodel)
                    for modelR in dmodel[typeModel][color][ML]
                ]

                # gather the predictions of every model, per chemical
                dprob = {}
                for ppredict in lpredict:
                    try:
                        dpredict = toolbox.loadMatrix(ppredict, sep=",")
                    except Exception:
                        # prediction failed for this model: skip it
                        continue
                    for chem in dpredict.keys():
                        if chem not in dprob:
                            dprob[chem] = {}
                            dprob[chem]["Pred"] = []
                            dprob[chem]["Aff"] = dpredict[chem]["Aff"]
                        if dpredict[chem]["Pred"] != "NA":
                            dprob[chem]["Pred"].append(
                                float(dpredict[chem]["Pred"]))

                pfsumML = proutbyRmodel + "sumProb"
                with open(pfsumML, "w") as fsumML:
                    fsumML.write("ID,Mpred,SDpred,Real\n")
                    for chem in dprob.keys():
                        if len(dprob[chem]["Pred"]) == 0:
                            fsumML.write("%s,NA,NA,%s\n" %
                                         (chem, dprob[chem]["Aff"]))
                        else:
                            fsumML.write(
                                "%s,%f,%f,%s\n" %
                                (chem, mean(dprob[chem]["Pred"]),
                                 std(dprob[chem]["Pred"]),
                                 dprob[chem]["Aff"]))

                pquality = runExternalSoft.qualityPred(pfsumML)
                runExternalSoft.plotAC50VSProb(pfsumML, proutbyRmodel)
                dperf[ML] = toolbox.loadMatrix(pquality, sep=",")

            pfsum = prcolor + "sumPerf.csv"
            with open(pfsum, "w") as fsum:
                fsum.write("Model," + ",".join(lh) + "\n")
                for ML in dperf.keys():
                    lw = []
                    for h in lh:
                        val = dperf[ML][h]["x"]
                        try:
                            # numeric criteria are rounded to 2 decimals
                            lw.append(str(round(float(val), 2)))
                        except (ValueError, TypeError):
                            lw.append(str(val))
                    fsum.write("%s,%s\n" % (ML, ",".join(lw)))