def calcRMSD_pymol(uf, bf):
    """
    Given two pdb files of the same protein (unbound and bound forms), use
    PyMOL to compute the RMSD between them (via the colorbyrmsd script), the
    solvent accessible surface area (ASA) of each, and the molecular weight
    of each.

    uf: path to the unbound-form pdb file
    bf: path to the bound-form pdb file

    Returns (rms, asa_u, asa_b, umass, bmass, urmsd, brmsd) where
    urmsd/brmsd are the per-residue values read back from the B-factor
    column of the structures written out by colorbyrmsd.

    Fix: removed a stray pdb.set_trace() left just before the return, which
    dropped every call into the debugger.
    """
    cmd.set("dot_solvent", 1)  # make get_area report solvent accessible area
    cmd.load(uf)
    cmd.load(bf)
    _, un, _ = getFileParts(uf)
    _, bn, _ = getFileParts(bf)
    asa_u = cmd.get_area(un)
    asa_b = cmd.get_area(bn)
    umass = cmd.get_model(un).get_mass()
    bmass = cmd.get_model(bn).get_mass()
    # colorbyrmsd (run through cmd.do) executes asynchronously: snapshot the
    # b-factor column now and wait below until it changes before reading
    # the results back.
    bv0 = []
    cmd.iterate('all', 'bv0.append(b)', space=locals())
    cmd.do('run colorbyrmsd.py; colorbyrmsd \'' + un + '\',\'' + bn +
           '\',guide = 0,doAlign=1, doPretty=1')
    while True:  # synchronization with the asynchronous colorbyrmsd run
        bv1 = []
        cmd.iterate('all', 'bv1.append(b)', space=locals())
        if bv0 != bv1:
            time.sleep(0.1)
            break
    # colorbyrmsd stores per-residue RMSD in the B-factor column; save both
    # structures to temporary pdb files and read those values back.
    out_file = tempfile.NamedTemporaryFile(suffix='.pdb')
    out_file.close()  # only the unique name is needed, not the open handle
    tmp_pdb = out_file.name
    updb = tmp_pdb + 'u'
    bpdb = tmp_pdb + 'b'
    cmd.save(updb, un)
    cmd.save(bpdb, bn)
    try:
        (_, uR, _, _, _) = readPDB(updb)
        urmsd = getBvalues(uR)
        (_, bR, _, _, _) = readPDB(bpdb)
        brmsd = getBvalues(bR)
    finally:
        # remove the temp files even when readPDB/getBvalues raises
        for tf in (updb, bpdb):
            if os.path.exists(tf):
                os.remove(tf)
    # atoms with no match get a negative b-value; exclude them from the
    # overall RMSD computation
    rms = np.sqrt(np.mean(
        np.array([v for V in urmsd for v in V if v >= 0])**2))
    cmd.reinitialize()
    return rms, asa_u, asa_b, umass, bmass, urmsd, brmsd
def batchExtract(pkldir, bdir, ofname): """ Running the information required for all files """ import glob flist = glob.glob(pkldir + '*.pdb.pkl') TT = len(flist) + 0.0 if os.path.isfile(ofname) is False: fdict = {} else: fdict = myPickle.load(ofname) for cnt, f in enumerate(flist): print '% Done =', cnt / TT (_, k, _) = getFileParts(getFileParts(f)[1]) #pdb.set_trace() k = k[:-2] if k not in fdict: print "Processing", f try: U = myPDB.loader(pkldir + k + '_u.pdb.pkl') B = myPDB.loader(pkldir + k + '_b.pdb.pkl') except: continue pdb.set_trace() #rmsd,Uidx,Bidx=calcRMSD(U,B) try: rpymol = calcRMSD_pymol(bdir + k + '_u.pdb', bdir + k + '_b.pdb') except: print "Error processing", k cmd.reinitialize() time.sleep(0.1) continue #pdb.set_trace() #useq=''.join([three_to_one(U.R[i].get_resname()) for i in Uidx]) #bseq=''.join([three_to_one(B.R[i].get_resname()) for i in Bidx]) #a_useq=ProteinAnalysis(U.seq) #a_bseq=ProteinAnalysis(B.seq) #asa_u=np.sum([U.ASA[i] for i in Uidx]) #asa_b=np.sum([B.ASA[i] for i in Bidx]) fdict[ k] = rpymol #+(BN.nanmean(U.B),BN.nanmean(B.B),BN.nanmedian(U.B),BN.nanmedian(B.B),BN.nanmax(U.B),BN.nanmax(B.B)) #pdb.set_trace() myPickle.dump(ofname, fdict) print k, rpymol[0] else: print "Already found", f return fdict
def parse1SVM(ifile, auconly=False, **kwargs):
    """
    Parse a single SVM result: build the pairwise score matrix Mv and label
    matrix Ml for the complex named by ifile and compute the pairwise AUC.

    ifile: result file path; the complex id is the first 4 characters of its
           base name
    auconly: when True return only the AUC

    Returns auc alone when auconly, else (auc, Mv, Ml, None, None, lrV, rrV).
    """
    exfname = 'EP_6N.lbl.pkl'
    sglfile = 'result.sgl.pkl'
    # Cache the expensively-loaded data on the function object. The original
    # "try: E / except NameError" idiom could never cache inside a function:
    # E and Asgl are locals, so every call raised UnboundLocalError (a
    # NameError subclass) and reloaded the files from disk.
    if not hasattr(parse1SVM, '_cache'):
        with open(sglfile, "rb") as fh:  # also closes the file handle now
            parse1SVM._cache = (getExamplesDBD.loader(exfname),
                                cPickle.load(fh))
    E, Asgl = parse1SVM._cache
    cid = getFileParts(getFileParts(ifile)[1])[1][:4]  # complex id, e.g. '1AY7'
    (la, ra, lrV, rrV) = Asgl[cid]
    Mv = np.zeros((len(lrV), len(rrV)))  # pairwise prediction scores
    Ml = np.zeros(Mv.shape)              # pairwise labels (+1/-1)
    for lidx, xr in enumerate(lrV.keys()):
        for ridx, xc in enumerate(rrV.keys()):
            # a residue pair is positive iff it appears in the known
            # positive examples for this complex
            if (xr, xc) in E.Pex[cid][0]:
                l = +1.0
            else:
                l = -1.0
            Mv[lidx, ridx] = lrV[xr][0] + rrV[xc][0]
            Ml[lidx, ridx] = l
    (_, _, auc) = roc.roc(list(Mv.flatten()), list(Ml.flatten()))
    if auconly:
        return auc
    return (auc, Mv, Ml, None, None, lrV, rrV)
def parse1SVM(ifile, auconly=False, **kwargs):
    """
    Build the pairwise score (Mv) and label (Ml) matrices for the complex
    referenced by ifile and compute the pairwise AUC.

    Returns the AUC alone when auconly is True, otherwise the tuple
    (auc, Mv, Ml, None, None, lrV, rrV).
    """
    exfname = 'EP_6N.lbl.pkl'
    sglfile = 'result.sgl.pkl'
    # Load the example labels / single-protein scores unless already bound.
    try:
        E
    except NameError:
        E = getExamplesDBD.loader(exfname)
    try:
        Asgl
    except NameError:
        Asgl = cPickle.load(open(sglfile, "rb"))
    # Complex id is the first four characters of the result file's base name.
    cid = getFileParts(getFileParts(ifile)[1])[1][:4]
    la, ra, lrV, rrV = Asgl[cid]
    I, J, V, L = [], [], [], []
    Mv = np.zeros((len(lrV), len(rrV)))
    Ml = np.zeros(Mv.shape)
    for lidx, xr in enumerate(lrV.keys()):
        for ridx, xc in enumerate(rrV.keys()):
            # A residue pair is a positive example iff it is listed in the
            # known interactions for this complex.
            lbl = +1.0 if (xr, xc) in E.Pex[cid][0] else -1.0
            score = lrV[xr][0] + rrV[xc][0]
            I.append(xr)
            J.append(xc)
            V.append(score)
            L.append(lbl)
            Mv[lidx, ridx] = score
            Ml[lidx, ridx] = lbl
    _, _, auc = roc.roc(list(Mv.flatten()), list(Ml.flatten()))
    if auconly:
        return auc
    return (auc, Mv, Ml, None, None, lrV, rrV)
def parseShandarFiles(ifile, auconly=False, **kwargs):  #(auc,Mv,Ml,lseq,rseq,lrV,rrV)
    """
    Reads shandar's output files with labels (made on the same pattern as
    analyzePredFile.readFile): <cid>.preds holds one prediction score per
    line and <cid>.cont the matching contact records.

    Returns the AUC alone when auconly is True, otherwise
    (auc, Mvm, Mlm, None, None, lrV, rrV).

    Fix: the two bare "except:" clauses (which also swallow
    KeyboardInterrupt/SystemExit) are narrowed to KeyError, the only
    exception the dict lookups can raise.
    """
    def parseContLine(ln):
        # ['A', '#5', 'ASN:7', 'N', '::', 'B', '#5', 'HIS:6', 'H:', '0', '53.61']
        #   0     1      2      3    4     5    6      7       8    9     10
        lns = ln.split()
        lidx = lns[0] + lns[1]  # ligand residue id: chain + residue number
        ridx = lns[5] + lns[6]  # receptor residue id
        lbl = int(lns[9])       # contact label
        return (lidx, ridx, lbl)

    loopath, cid, _ = getFileParts(ifile)
    lcids = cid.split('_')[1]  # ligand chain ids
    rcids = cid.split('_')[2]  # receptor chain ids
    Mlidx = {}  # ligand residue id -> row index
    Mridx = {}  # receptor residue id -> column index
    Mlv = []    # (row, col, label, score) triples collected from the files
    l = 0
    r = 0
    with open(os.path.join(loopath, cid + '.preds')) as fp, \
            open(os.path.join(loopath, cid + '.cont')) as fc:
        for lnp, lnc in zip(fp, fc):
            (lidx, ridx, lbl) = parseContLine(lnc)
            # keep only residue pairs on the requested chains
            if lidx[0] in lcids and ridx[0] in rcids:
                try:
                    lx = Mlidx[lidx]
                except KeyError:  # first time we see this ligand residue
                    Mlidx[lidx] = lx = l
                    l = l + 1
                try:
                    rx = Mridx[ridx]
                except KeyError:  # first time we see this receptor residue
                    Mridx[ridx] = rx = r
                    r = r + 1
                Mlv.append((lx, rx, lbl, float(lnp)))
    Mvm = np.zeros((l, r))
    Mvm.fill(np.nan)  # pairs never seen in the files stay NaN
    Mlm = np.zeros((l, r))
    for (lx, rx, lbl, p) in Mlv:
        Mlm[lx, rx] = lbl
        Mvm[lx, rx] = p
    (_, _, auc) = roc.roc(list(Mvm.flatten()), list(Mlm.flatten()))
    if auconly:
        return auc
    # construct lrV, rrV: per-protein best score/label, as readFile does
    lrV = dict(zip(range(Mvm.shape[0]),
                   zip(np.max(Mvm, axis=1), np.max(Mlm, axis=1))))
    rrV = dict(zip(range(Mvm.shape[1]),
                   zip(np.max(Mvm, axis=0), np.max(Mlm, axis=0))))
    return auc, Mvm, Mlm, None, None, lrV, rrV
except ImportError: print "Failure importing MPI4py: Not using MPI parallelization." comm=None myid=0 nprocs=1 csize=int(np.ceil(len(fs)/float(nprocs))) gclist=list(chunks(fs,csize)) myfs=gclist[myid] LV=[] TPS=[] DNTP=[] dsA={} LVP=[] for i,ifile in enumerate(myfs): try: cid=getFileParts(getFileParts(ifile)[1])[1][:4] if incids is not None and cid not in incids: continue print "Processing",cid if auconly: auc=readFile(ifile,auconly=True) dsA[cid]=(auc,np.nan,np.nan) else: if postprocess: pauc,Mvc0,Mvc,Mlc,lseq,rseq,lrV0,lrV,rrV0,rrV=postProcessAvg(cid,pdbpklpath,loopath) ifile=(pauc,Mvc,Mlc,lseq,rseq,lrV,rrV) (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx,lv,ll,rv,rl)=computeNTP(ifile,top=200) #lv,ll,rv,rl TPS.append([ttp,fpi,100.0*ttp/pp,pp,nn,pp+nn]) DNTP.append(dntp) LV.append((list(Mvx),list(Mlx)))
import glob
import numpy as np
from analyzeLOOCV import computeNTP, calcRFPP
# Compare leave-one-out cross-validation prediction results from two
# directories over the same set of complexes.
dir1 = './DBD3LOOCVSR/'
dir2 = './DBD3LOOCV/'
fs = glob.glob(dir1 + '*.pairpred.txt')
from BISEPutils import getFileParts
# One row per complex: [auc, ttp, fpi, la, ra] for each directory.
F1 = np.zeros((len(fs), 5))
F2 = np.zeros((len(fs), 5))
DNTP1 = []  # per-complex dntp from computeNTP for dir1 (fed to calcRFPP)
DNTP2 = []  # same for dir2
for i, f1 in enumerate(fs):
    fp = getFileParts(f1)
    # matching prediction file for the same complex in dir2
    f2 = dir2 + fp[1] + fp[2]
    (auc, ttp, fpi, dntp, la, ra, pp, nn, Mvx, Mlx) = computeNTP(f1)
    DNTP1.append(dntp)
    print i, f1, auc, ttp, fpi, la, ra
    F1[i, :] = [auc, ttp, fpi, la, ra]
    (auc, ttp, fpi, dntp, la, ra, pp, nn, Mvx, Mlx) = computeNTP(f2)
    DNTP2.append(dntp)
    print i, f2, auc, ttp, fpi, la, ra
    F2[i, :] = [auc, ttp, fpi, la, ra]
# Column 2 holds the fpi values; summarize RFPP for each directory.
FPI1 = F1[:, 2]
FPI2 = F2[:, 2]
print "RFPP for", dir1
calcRFPP(FPI1, DNTP1)
# NOTE(review): the symmetric calcRFPP(FPI2, DNTP2) call for dir2 is not
# visible here — the chunk appears truncated after this print; confirm.
print "RFPP for", dir2
import myPickle,re pdbpklpath='../../DBD4CSPKL/PKL/'#'../../DBD4N/PDBPKL4' #dirs=['../../DBD4_ESR_prop/','../../DBD4_NoESR_prop/','../../DBD4_SGDSVM/','../../DBD_SSVM_3E-6/','../../DBD_SPW_3E-6/'] #dirs=['../DBD4_ESR_prop/']#,'../SGD_DBD4/' #dirs=['../DBD4_SGDSVM/','../DBD_SPW_3E-8/'] #cids=['2OOB', '1EFN', '1PPE', '1J2J', '1GL1', '1SYX', '1Z0K', '1AY7', '1FFW', '3SGQ', '1S1Q', '1FLE', '7CEI', '2IDO', '1KTZ', '4CPA', '2UUY', '1R6Q', '1D6R', '1OC0', '1CGI', '1R0R', '1EAW', '1GCQ', '1XD3', '1LFD', '2I25', '1CLV', '1H9D', '1ACB', '2SNI', '3D5S', '1Z5Y', '2HRK', '2ABZ', '1UDI', '1PXV', '2J0T']#E.Pex.keys()[20:40] #dirs=['../../SGD_DBD4/'] dirs=['../../DBD4S_SMO196/','../../DBD4S_SGD196/','../../DBD4S_SGD196_CENT/','../../DBD4S_SGDCENTPW71/']#['../../DBD4_NoESR_prop/'] # FIND THE COMMON SET cids=None Re=(r"(\S+)\.",r"(\S+)\#(\S+)\.") for d in dirs: cids_d=[] for f in glob.glob(d+'/*.pairpred.txt'): cx=re.match(Re[int('#' in f)],getFileParts(f)[1]).groups() if len(cx)==1: cx=cx[0] cids_d.append(cx) if cids is None: cids=set(cids_d) else: cids=cids.intersection(cids_d) cids=list(cids) R={} for cid in cids: rfpp=[] for d in dirs: fname=d+('#'.join(cid))+'.pairpred.txt' try: (auc0,Mv0,Mv,Ml,lseq,rseq,lrV0,lrV,rrV0,rrV)=postProcessAvg(cid,pdbpklpath,d)
except ImportError: print "Failure importing MPI4py: Not using MPI parallelization." comm = None myid = 0 nprocs = 1 csize = int(np.ceil(len(fs) / float(nprocs))) gclist = list(chunks(fs, csize)) myfs = gclist[myid] LV = [] TPS = [] DNTP = [] dsA = {} LVP = [] for i, ifile in enumerate(myfs): try: cid = getFileParts(getFileParts(ifile)[1])[1][:4] if incids is not None and cid not in incids: continue print "Processing", cid if auconly: auc = readFile(ifile, auconly=True) dsA[cid] = (auc, np.nan, np.nan) else: if postprocess: pauc, Mvc0, Mvc, Mlc, lseq, rseq, lrV0, lrV, rrV0, rrV = postProcessAvg( cid, pdbpklpath, loopath) ifile = (pauc, Mvc, Mlc, lseq, rseq, lrV, rrV) (auc, ttp, fpi, dntp, la, ra, pp, nn, Mvx, Mlx, lv, ll, rv, rl) = computeNTP(ifile, top=200) #lv,ll,rv,rl TPS.append([ttp, fpi, 100.0 * ttp / pp, pp, nn, pp + nn])
@author: root
"""
import glob
import numpy as np
from analyzeLOOCV import computeNTP, calcRFPP
# Compare leave-one-out cross-validation prediction results from two
# directories over the same set of complexes.
dir1='./DBD3LOOCVSR/'
dir2='./DBD3LOOCV/'
fs=glob.glob(dir1+'*.pairpred.txt')
from BISEPutils import getFileParts
# One row per complex: [auc, ttp, fpi, la, ra] for each directory.
F1=np.zeros((len(fs),5))
F2=np.zeros((len(fs),5))
DNTP1=[]  # per-complex dntp from computeNTP for dir1 (fed to calcRFPP)
DNTP2=[]  # same for dir2
for i,f1 in enumerate(fs):
    fp=getFileParts(f1)
    # matching prediction file for the same complex in dir2
    f2=dir2+fp[1]+fp[2]
    (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx)=computeNTP(f1);
    DNTP1.append(dntp)
    print i,f1,auc, ttp, fpi,la,ra
    F1[i,:]=[auc ,ttp ,fpi ,la ,ra]
    (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx)=computeNTP(f2);
    DNTP2.append(dntp)
    print i,f2,auc, ttp, fpi,la,ra
    F2[i,:]=[auc ,ttp ,fpi ,la ,ra]
# Column 2 holds the fpi values; summarize RFPP for each directory.
FPI1=F1[:,2]
FPI2=F2[:,2]
print "RFPP for",dir1
calcRFPP(FPI1,DNTP1)
# NOTE(review): the symmetric calcRFPP(FPI2, DNTP2) call for dir2 is not
# visible here — the chunk appears truncated after this print; confirm.
print "RFPP for",dir2
if __name__ == '__main__': LV = [] TPS = [] DNTP = [] #auconly=False # whether to calculate the avg. auc of the complexes or do more #(auc,(fp,tp),(A,Rx,D,L,cids,r,dkey))=getAUC('./Results/result_tppk.res.pkl') #dAo=dict(zip(cids,A)) #AUCs from the training data set (CV) only loopath = './Shandar/data-sets/data-sets/' #C:\Users\Afsar\Desktop\pairpred\sequence only\DBD3LOOCVSEQ fsp = glob.glob(loopath + '*.preds') dA = {} dsA = {} #daoo={} for i, ifile in enumerate(fsp): cid = getFileParts(ifile)[1] print 'cid =', cid, 100 * float(i + 1) / len(fsp), '% done' (Ml, Mv) = parseShandarFiles(cid, loopath) #(auc,Mv,Ml,lseq,rseq,lrV,rrV)=readFile(ifile,usePDBidx=False);#(auc,Mv,Ml,lseq,rseq,lrV,rrV) #(la,lv,ll)=getAUC4Protein(lrV) #(ra,rv,rl)=getAUC4Protein(rrV) Mvx = Mv.ravel() Mlx = Ml.ravel() nidx = ~np.isnan(Mvx) & ~np.isnan(Mlx) Mvx[~nidx] = -np.inf (ttp, fpi, dntp) = findNTPinTop(Mvx, Mlx, Mv.shape, top=500) Mvx = Mvx[nidx] Mlx = Mlx[nidx] pp = np.sum(Mlx == 1) # total number of positives nn = len(Mlx) - pp #total number of negatives
return (Mlm,Mvm) if __name__=='__main__': LV=[] TPS=[] DNTP=[] #auconly=False # whether to calculate the avg. auc of the complexes or do more #(auc,(fp,tp),(A,Rx,D,L,cids,r,dkey))=getAUC('./Results/result_tppk.res.pkl') #dAo=dict(zip(cids,A)) #AUCs from the training data set (CV) only loopath='./Shandar/data-sets/data-sets/'#C:\Users\Afsar\Desktop\pairpred\sequence only\DBD3LOOCVSEQ fsp=glob.glob(loopath+'*.preds') dA={} dsA={} #daoo={} for i,ifile in enumerate(fsp): cid=getFileParts(ifile)[1] print 'cid =',cid,100*float(i+1)/len(fsp),'% done' (Ml,Mv)=parseShandarFiles(cid,loopath) #(auc,Mv,Ml,lseq,rseq,lrV,rrV)=readFile(ifile,usePDBidx=False);#(auc,Mv,Ml,lseq,rseq,lrV,rrV) #(la,lv,ll)=getAUC4Protein(lrV) #(ra,rv,rl)=getAUC4Protein(rrV) Mvx=Mv.ravel() Mlx=Ml.ravel() nidx=~np.isnan(Mvx) & ~np.isnan(Mlx) Mvx[~nidx]=-np.inf (ttp,fpi,dntp)=findNTPinTop(Mvx,Mlx,Mv.shape,top=500) Mvx=Mvx[nidx] Mlx=Mlx[nidx] pp=np.sum(Mlx==1) # total number of positives nn=len(Mlx)-pp #total number of negatives
def parseShandarFiles(ifile, auconly=False, **kwargs):  #(auc,Mv,Ml,lseq,rseq,lrV,rrV)
    """
    Reads shandar's output files with labels (made on the same pattern as
    analyzePredFile.readFile): <cid>.preds carries one prediction score per
    line, <cid>.cont the matching contact records.

    Returns the AUC alone when auconly is True, otherwise
    (auc, Mvm, Mlm, None, None, lrV, rrV).
    """
    def _contact_fields(line):
        # ['A', '#5', 'ASN:7', 'N', '::', 'B', '#5', 'HIS:6', 'H:', '0', '53.61']
        #   0     1      2      3    4     5    6      7       8    9     10
        parts = line.split()
        return parts[0] + parts[1], parts[5] + parts[6], int(parts[9])

    loopath, cid, _ = getFileParts(ifile)
    lcids = cid.split('_')[1]  # ligand chain ids
    rcids = cid.split('_')[2]  # receptor chain ids
    row_of = {}   # ligand residue id -> matrix row
    col_of = {}   # receptor residue id -> matrix column
    entries = []  # (row, col, label, score) records read from the files
    nrows = 0
    ncols = 0
    preds_path = os.path.join(loopath, cid + '.preds')
    cont_path = os.path.join(loopath, cid + '.cont')
    with open(preds_path) as fp, open(cont_path) as fc:
        for score_line, cont_line in zip(fp, fc):
            lres, rres, lbl = _contact_fields(cont_line)
            # keep only residue pairs that lie on the requested chains
            if lres[0] not in lcids or rres[0] not in rcids:
                continue
            if lres not in row_of:
                row_of[lres] = nrows
                nrows += 1
            if rres not in col_of:
                col_of[rres] = ncols
                ncols += 1
            entries.append((row_of[lres], col_of[rres], lbl,
                            float(score_line)))
    Mvm = np.zeros((nrows, ncols))
    Mvm.fill(np.nan)  # residue pairs never seen in the files remain NaN
    Mlm = np.zeros((nrows, ncols))
    for rr, cc, lbl, score in entries:
        Mlm[rr, cc] = lbl
        Mvm[rr, cc] = score
    _, _, auc = roc.roc(list(Mvm.flatten()), list(Mlm.flatten()))
    if auconly:
        return auc
    # construct lrV, rrV: per-protein best score/label vectors
    lrV = dict(zip(range(Mvm.shape[0]),
                   zip(np.max(Mvm, axis=1), np.max(Mlm, axis=1))))
    rrV = dict(zip(range(Mvm.shape[1]),
                   zip(np.max(Mvm, axis=0), np.max(Mlm, axis=0))))
    return auc, Mvm, Mlm, None, None, lrV, rrV