Example #1
0
def calcRMSD_pymol(uf, bf):
    """
    Given two pdb files of the same protein (unbound uf, bound bf), use PyMOL
    to compute:
      rms   : overall RMSD (root-mean-square of the per-atom deviations that
              colorbyrmsd.py writes into the B-factor column)
      asa_u, asa_b : solvent accessible surface area of each structure
      umass, bmass : molecular weight of each structure
      urmsd, brmsd : per-residue B-value (deviation) lists for each structure
    Returns (rms, asa_u, asa_b, umass, bmass, urmsd, brmsd).
    """
    # dot_solvent=1 makes cmd.get_area return solvent accessible surface area.
    cmd.set("dot_solvent", 1)
    cmd.load(uf)
    cmd.load(bf)
    _, un, _ = getFileParts(uf)
    _, bn, _ = getFileParts(bf)
    asa_u = cmd.get_area(un)
    asa_b = cmd.get_area(bn)

    umass = cmd.get_model(un).get_mass()
    bmass = cmd.get_model(bn).get_mass()

    # colorbyrmsd.py runs asynchronously via cmd.do and stores per-atom
    # deviations in the B-factor column.  Snapshot the B-factors first and
    # poll until they change, so we know the script has finished.
    bv0 = []
    cmd.iterate('all', 'bv0.append(b)', space=locals())
    cmd.do('run colorbyrmsd.py; colorbyrmsd \'' + un + '\',\'' + bn +
           '\',guide = 0,doAlign=1, doPretty=1')
    while True:  # synchronization with the asynchronous colorbyrmsd run
        bv1 = []
        cmd.iterate('all', 'bv1.append(b)', space=locals())
        if bv0 != bv1:
            time.sleep(0.1)  # grace period so PyMOL finishes writing
            break
        time.sleep(0.05)  # BUG FIX: was a busy-wait spinning at full CPU

    # NamedTemporaryFile is used only to obtain a unique base name: the file
    # itself is removed on close(), and the 'u'/'b' suffixed names derived
    # from it are what actually get written (and removed below).
    out_file = tempfile.NamedTemporaryFile(suffix='.pdb')
    out_file.close()
    tmp_pdb = out_file.name
    updb = tmp_pdb + 'u'
    bpdb = tmp_pdb + 'b'
    cmd.save(updb, un)
    cmd.save(bpdb, bn)
    (_, uR, _, _, _) = readPDB(updb)
    urmsd = getBvalues(uR)
    os.remove(updb)
    (_, bR, _, _, _) = readPDB(bpdb)
    brmsd = getBvalues(bR)
    os.remove(bpdb)
    # Overall RMSD from the unbound structure's per-atom deviations;
    # negative B-values mark atoms with no deviation estimate and are skipped.
    rms = np.sqrt(np.mean(
        np.array([v for V in urmsd for v in V if v >= 0])**2))

    cmd.reinitialize()
    # BUG FIX: removed a stray pdb.set_trace() that unconditionally dropped
    # into the debugger on every call.
    return rms, asa_u, asa_b, umass, bmass, urmsd, brmsd
Example #2
0
def batchExtract(pkldir, bdir, ofname):
    """
    Running the information required for all files
    """
    import glob

    flist = glob.glob(pkldir + '*.pdb.pkl')
    TT = len(flist) + 0.0
    if os.path.isfile(ofname) is False:
        fdict = {}
    else:
        fdict = myPickle.load(ofname)
    for cnt, f in enumerate(flist):
        print '% Done =', cnt / TT
        (_, k, _) = getFileParts(getFileParts(f)[1])
        #pdb.set_trace()
        k = k[:-2]
        if k not in fdict:
            print "Processing", f
            try:
                U = myPDB.loader(pkldir + k + '_u.pdb.pkl')
                B = myPDB.loader(pkldir + k + '_b.pdb.pkl')
            except:
                continue
            pdb.set_trace()
            #rmsd,Uidx,Bidx=calcRMSD(U,B)
            try:
                rpymol = calcRMSD_pymol(bdir + k + '_u.pdb',
                                        bdir + k + '_b.pdb')
            except:
                print "Error processing", k
                cmd.reinitialize()
                time.sleep(0.1)
                continue

            #pdb.set_trace()
            #useq=''.join([three_to_one(U.R[i].get_resname()) for i in Uidx])
            #bseq=''.join([three_to_one(B.R[i].get_resname()) for i in Bidx])
            #a_useq=ProteinAnalysis(U.seq)
            #a_bseq=ProteinAnalysis(B.seq)
            #asa_u=np.sum([U.ASA[i] for i in Uidx])
            #asa_b=np.sum([B.ASA[i] for i in Bidx])
            fdict[
                k] = rpymol  #+(BN.nanmean(U.B),BN.nanmean(B.B),BN.nanmedian(U.B),BN.nanmedian(B.B),BN.nanmax(U.B),BN.nanmax(B.B))
            #pdb.set_trace()
            myPickle.dump(ofname, fdict)
            print k, rpymol[0]
        else:
            print "Already found", f
    return fdict
def parse1SVM(ifile, auconly=False, **kwargs):  #,E,Asgl
    """
    Parse a single-protein SVM result: for the complex named by ifile, build
    the pairwise score matrix Mv (sum of the two per-residue scores) and the
    label matrix Ml (+1 for a true interacting pair, -1 otherwise), and score
    them with ROC.

    Returns auc when auconly is True, else (auc, Mv, Ml, None, None, lrV, rrV).
    """
    exfname = 'EP_6N.lbl.pkl'
    sglfile = 'result.sgl.pkl'
    # BUG FIX: the original try/except NameError "caching" can never work for
    # function locals -- referencing an unassigned local raises
    # UnboundLocalError (a NameError subclass) -- so both objects were
    # reloaded on every call anyway.  Load them directly, and close the
    # pickle file handle (it was previously leaked).
    E = getExamplesDBD.loader(exfname)
    with open(sglfile, "rb") as fh:
        Asgl = cPickle.load(fh)

    cid = getFileParts(getFileParts(ifile)[1])[1][:4]
    (la, ra, lrV, rrV) = Asgl[cid]

    # Removed the dead I/J/V/L accumulator lists: they were appended to but
    # only ever consumed by commented-out code.
    Mv = np.zeros((len(lrV), len(rrV)))
    Ml = np.zeros(Mv.shape)
    for lidx, xr in enumerate(lrV.keys()):
        for ridx, xc in enumerate(rrV.keys()):
            # Label: +1 if (xr, xc) is a known interacting pair for cid.
            Ml[lidx, ridx] = +1.0 if (xr, xc) in E.Pex[cid][0] else -1.0
            # Pair score = sum of the two single-protein scores.
            Mv[lidx, ridx] = lrV[xr][0] + rrV[xc][0]

    (_, _, auc) = roc.roc(list(Mv.flatten()), list(Ml.flatten()))
    if auconly:
        return auc

    return (auc, Mv, Ml, None, None, lrV, rrV)  #auc,Mvm,Mlm,None,None,lrV,rrV
def parse1SVM(ifile,auconly=False,**kwargs):#,E,Asgl
    # NOTE(review): this is an exact (unformatted) duplicate of the
    # parse1SVM defined earlier in this file; whichever definition comes
    # last at import time wins.
    """
    Parse a single-protein SVM result for the complex named by ifile:
    build pairwise score matrix Mv and label matrix Ml (+1 true pair,
    -1 otherwise) and score them with ROC.  Returns auc when auconly is
    True, else (auc, Mv, Ml, None, None, lrV, rrV).
    """
    exfname='EP_6N.lbl.pkl'
    sglfile='result.sgl.pkl'
    # NOTE(review): this try/except NameError "cache" never caches for a
    # function local -- referencing the unassigned local raises
    # UnboundLocalError (a NameError subclass) -- so E and Asgl are
    # reloaded on every call.
    try:
        E
    except NameError:

        E=getExamplesDBD.loader(exfname)
    try:
        Asgl
    except NameError:
        Asgl=cPickle.load(open(sglfile, "rb" ))

    # cid = first four characters of the base file name (the PDB complex id).
    cid=getFileParts(getFileParts(ifile)[1])[1][:4]
    (la,ra,lrV,rrV)=Asgl[cid]

    # I/J/V/L are filled but only consumed by the commented-out code below.
    I=[]
    J=[]
    V=[]
    L=[]
    # Mv: pairwise score matrix; Ml: label matrix.
    Mv=np.zeros((len(lrV),len(rrV)))
    Ml=np.zeros(Mv.shape)
    for lidx,xr in enumerate(lrV.keys()):
        for ridx,xc in enumerate(rrV.keys()):
            if (xr,xc) in E.Pex[cid][0]:
                l=+1.0
            else:
                l=-1.0
            I.append(xr)
            J.append(xc)
            # Pair score = sum of the two single-protein scores.
            v=lrV[xr][0]+rrV[xc][0]
            V.append(v)
            L.append(l)
            Mv[lidx,ridx]=v
            Ml[lidx,ridx]=l

    #pdb.set_trace()
#    for idx in range(len(I)):
#        Mv[I[idx],J[idx]]=V[idx]
#        Ml[I[idx],J[idx]]=L[idx]
    (_,_,auc)=roc.roc(list(Mv.flatten()),list(Ml.flatten()))
    if auconly:
        return auc

    return (auc,Mv,Ml,None,None,lrV,rrV) #auc,Mvm,Mlm,None,None,lrV,rrV
def parseShandarFiles(ifile,auconly=False,**kwargs): #(auc,Mv,Ml,lseq,rseq,lrV,rrV)
    """
    Reads shandar's output files with labels (made on the same pattern as
    analyzePredFile.readFile): pairs up <cid>.preds (one prediction per line)
    with <cid>.cont (contact descriptions), keeps only residue pairs whose
    chains belong to the left/right chain sets encoded in the file name
    (name_LCHAINS_RCHAINS), and builds dense prediction (Mvm, NaN = unscored)
    and label (Mlm) matrices.

    Returns auc when auconly is True, else
    (auc, Mvm, Mlm, None, None, lrV, rrV) where lrV/rrV map row/column index
    to (max score, max label) over that row/column.
    """
    def parseContLine(ln):
        # ['A', '#5', 'ASN:7', 'N', '::', 'B', '#5', 'HIS:6', 'H:', '0', '53.61']
        #   0   1       2       3     4     5   6       7       8    9      10
        lns=ln.split()
        lidx=lns[0]+lns[1]   # chain id + residue number, left protein
        ridx=lns[5]+lns[6]   # chain id + residue number, right protein
        lbl=int(lns[9])      # contact label
        return (lidx,ridx,lbl)

    loopath,cid,_=getFileParts(ifile)
    lcids=cid.split('_')[1]  # chains of the left protein
    rcids=cid.split('_')[2]  # chains of the right protein
    Mlidx={}   # residue id -> dense row index
    Mridx={}   # residue id -> dense column index
    Mlv=[]     # collected (row, col, label, prediction) tuples
    l=0
    r=0
    with open(os.path.join(loopath,cid+'.preds')) as fp,open(os.path.join(loopath,cid+'.cont')) as fc:
        for lnp,lnc in zip(fp,fc):
            (lidx,ridx,lbl)=parseContLine(lnc)
            if lidx[0] in lcids and ridx[0] in rcids:
                # Assign dense indices on first sight of each residue.
                # BUG FIX: was a bare except around the dict lookup, which
                # would also swallow unrelated errors.
                if lidx in Mlidx:
                    lx=Mlidx[lidx]
                else:
                    Mlidx[lidx]=lx=l
                    l=l+1
                if ridx in Mridx:
                    rx=Mridx[ridx]
                else:
                    Mridx[ridx]=rx=r
                    r=r+1
                p=float(lnp)
                Mlv.append((lx,rx,lbl,p))
    Mvm=np.zeros((l,r))
    Mvm.fill(np.nan)  # NaN marks pairs with no prediction
    Mlm=np.zeros((l,r))
    for (lx,rx,lbl,p) in Mlv:
        Mlm[lx,rx]=lbl
        Mvm[lx,rx]=p

    (_,_,auc)=roc.roc(list(Mvm.flatten()),list(Mlm.flatten()))
    if auconly:
        return auc
    #construct lrV,rrV: per-residue (max score, max label) summaries
    lrV=dict(zip(range(Mvm.shape[0]),zip(np.max(Mvm,axis=1),np.max(Mlm,axis=1))))
    rrV=dict(zip(range(Mvm.shape[1]),zip(np.max(Mvm,axis=0),np.max(Mlm,axis=0))))

    return auc,Mvm,Mlm,None,None,lrV,rrV
Example #6
0
 except ImportError:
     print "Failure importing MPI4py: Not using MPI parallelization."
     comm=None
     myid=0
     nprocs=1 
 csize=int(np.ceil(len(fs)/float(nprocs)))
 gclist=list(chunks(fs,csize))    
 myfs=gclist[myid]
 LV=[]   
 TPS=[]
 DNTP=[]
 dsA={}
 LVP=[]
 for i,ifile in enumerate(myfs):        
     try:
         cid=getFileParts(getFileParts(ifile)[1])[1][:4]
         if incids is not None and cid not in incids:
             continue
         print "Processing",cid
         
         if auconly:
             auc=readFile(ifile,auconly=True)
             dsA[cid]=(auc,np.nan,np.nan)     
         else:                
             if postprocess:
                 pauc,Mvc0,Mvc,Mlc,lseq,rseq,lrV0,lrV,rrV0,rrV=postProcessAvg(cid,pdbpklpath,loopath)
                 ifile=(pauc,Mvc,Mlc,lseq,rseq,lrV,rrV)
             (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx,lv,ll,rv,rl)=computeNTP(ifile,top=200)    #lv,ll,rv,rl        
             TPS.append([ttp,fpi,100.0*ttp/pp,pp,nn,pp+nn])
             DNTP.append(dntp)
             LV.append((list(Mvx),list(Mlx))) 
Example #7
0
# Compare two LOOCV result directories by computing, for every
# *.pairpred.txt file found in dir1, the AUC and top-prediction statistics
# of that file and of its counterpart in dir2, then summarising with calcRFPP.
import glob
import numpy as np
from analyzeLOOCV import computeNTP, calcRFPP

dir1 = './DBD3LOOCVSR/'
dir2 = './DBD3LOOCV/'
fs = glob.glob(dir1 + '*.pairpred.txt')
from BISEPutils import getFileParts

# One row per complex: [auc, ttp, fpi, la, ra]
F1 = np.zeros((len(fs), 5))
F2 = np.zeros((len(fs), 5))
DNTP1 = []
DNTP2 = []
for i, f1 in enumerate(fs):

    # Path of the counterpart file in dir2 (same base name + extension).
    fp = getFileParts(f1)
    f2 = dir2 + fp[1] + fp[2]
    (auc, ttp, fpi, dntp, la, ra, pp, nn, Mvx, Mlx) = computeNTP(f1)
    DNTP1.append(dntp)
    print i, f1, auc, ttp, fpi, la, ra
    F1[i, :] = [auc, ttp, fpi, la, ra]
    (auc, ttp, fpi, dntp, la, ra, pp, nn, Mvx, Mlx) = computeNTP(f2)
    DNTP2.append(dntp)
    print i, f2, auc, ttp, fpi, la, ra
    F2[i, :] = [auc, ttp, fpi, la, ra]

# Column 2 holds the false-positive index (fpi) per complex.
FPI1 = F1[:, 2]
FPI2 = F2[:, 2]
print "RFPP for", dir1
calcRFPP(FPI1, DNTP1)
print "RFPP for", dir2
# NOTE(review): the matching calcRFPP(FPI2, DNTP2) call for dir2 appears to
# be missing here -- the script looks truncated; confirm against the
# original source.
Example #8
0
# Determine the set of complex ids common to every result directory in
# `dirs`, so that later per-complex comparisons run over the same complexes.
import myPickle,re
pdbpklpath='../../DBD4CSPKL/PKL/'#'../../DBD4N/PDBPKL4'

#dirs=['../../DBD4_ESR_prop/','../../DBD4_NoESR_prop/','../../DBD4_SGDSVM/','../../DBD_SSVM_3E-6/','../../DBD_SPW_3E-6/']
#dirs=['../DBD4_ESR_prop/']#,'../SGD_DBD4/'
#dirs=['../DBD4_SGDSVM/','../DBD_SPW_3E-8/']
#cids=['2OOB', '1EFN', '1PPE', '1J2J', '1GL1', '1SYX', '1Z0K', '1AY7', '1FFW', '3SGQ', '1S1Q', '1FLE', '7CEI', '2IDO', '1KTZ', '4CPA', '2UUY', '1R6Q', '1D6R', '1OC0', '1CGI', '1R0R', '1EAW', '1GCQ', '1XD3', '1LFD', '2I25', '1CLV', '1H9D', '1ACB', '2SNI', '3D5S', '1Z5Y', '2HRK', '2ABZ', '1UDI', '1PXV', '2J0T']#E.Pex.keys()[20:40]    
#dirs=['../../SGD_DBD4/']
dirs=['../../DBD4S_SMO196/','../../DBD4S_SGD196/','../../DBD4S_SGD196_CENT/','../../DBD4S_SGDCENTPW71/']#['../../DBD4_NoESR_prop/']
# FIND THE COMMON SET 
cids=None
# Two file-name patterns: plain "name." ids and composite "left#right." ids.
Re=(r"(\S+)\.",r"(\S+)\#(\S+)\.")
for d in dirs:
    cids_d=[]
    for f in glob.glob(d+'/*.pairpred.txt'):
        # Choose the composite pattern when the file name contains '#';
        # groups() yields a 1- or 2-tuple accordingly.
        cx=re.match(Re[int('#' in f)],getFileParts(f)[1]).groups()
        if len(cx)==1:
            cx=cx[0]  # collapse single-group matches to a plain string
        cids_d.append(cx)
    if cids is None:
        cids=set(cids_d)
    else:
        cids=cids.intersection(cids_d)
cids=list(cids)
R={}
for cid in cids:
    rfpp=[]
    for d in dirs:    
        fname=d+('#'.join(cid))+'.pairpred.txt'
        try:            
            (auc0,Mv0,Mv,Ml,lseq,rseq,lrV0,lrV,rrV0,rrV)=postProcessAvg(cid,pdbpklpath,d) 
Example #9
0
    except ImportError:
        print "Failure importing MPI4py: Not using MPI parallelization."
        comm = None
        myid = 0
        nprocs = 1
    csize = int(np.ceil(len(fs) / float(nprocs)))
    gclist = list(chunks(fs, csize))
    myfs = gclist[myid]
    LV = []
    TPS = []
    DNTP = []
    dsA = {}
    LVP = []
    for i, ifile in enumerate(myfs):
        try:
            cid = getFileParts(getFileParts(ifile)[1])[1][:4]
            if incids is not None and cid not in incids:
                continue
            print "Processing", cid

            if auconly:
                auc = readFile(ifile, auconly=True)
                dsA[cid] = (auc, np.nan, np.nan)
            else:
                if postprocess:
                    pauc, Mvc0, Mvc, Mlc, lseq, rseq, lrV0, lrV, rrV0, rrV = postProcessAvg(
                        cid, pdbpklpath, loopath)
                    ifile = (pauc, Mvc, Mlc, lseq, rseq, lrV, rrV)
                (auc, ttp, fpi, dntp, la, ra, pp, nn, Mvx, Mlx, lv, ll, rv,
                 rl) = computeNTP(ifile, top=200)  #lv,ll,rv,rl
                TPS.append([ttp, fpi, 100.0 * ttp / pp, pp, nn, pp + nn])
Example #10
0
@author: root
"""
# NOTE(review): this script is an unformatted duplicate of the LOOCV
# directory-comparison script that appears earlier in this file.
# It compares two LOOCV result directories via computeNTP/calcRFPP.
import glob
import numpy as np
from analyzeLOOCV import computeNTP, calcRFPP
dir1='./DBD3LOOCVSR/'
dir2='./DBD3LOOCV/'
fs=glob.glob(dir1+'*.pairpred.txt')
from BISEPutils import getFileParts
# One row per complex: [auc, ttp, fpi, la, ra]
F1=np.zeros((len(fs),5))
F2=np.zeros((len(fs),5))
DNTP1=[]
DNTP2=[]
for i,f1 in enumerate(fs):

    # Path of the counterpart file in dir2 (same base name + extension).
    fp=getFileParts(f1)
    f2=dir2+fp[1]+fp[2]
    (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx)=computeNTP(f1);
    DNTP1.append(dntp)
    print i,f1,auc, ttp, fpi,la,ra
    F1[i,:]=[auc ,ttp ,fpi ,la ,ra]
    (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx)=computeNTP(f2);
    DNTP2.append(dntp)
    print i,f2,auc, ttp, fpi,la,ra
    F2[i,:]=[auc ,ttp ,fpi ,la ,ra]

# Column 2 holds the false-positive index (fpi) per complex.
FPI1=F1[:,2]
FPI2=F2[:,2]
print "RFPP for",dir1
calcRFPP(FPI1,DNTP1)
print "RFPP for",dir2
# NOTE(review): the matching calcRFPP(FPI2, DNTP2) call for dir2 appears to
# be missing -- likely truncated; confirm against the original source.
Example #11
0

if __name__ == '__main__':
    LV = []
    TPS = []
    DNTP = []
    #auconly=False # whether to calculate the avg. auc of the complexes or do more
    #(auc,(fp,tp),(A,Rx,D,L,cids,r,dkey))=getAUC('./Results/result_tppk.res.pkl')
    #dAo=dict(zip(cids,A)) #AUCs from the training data set (CV) only
    loopath = './Shandar/data-sets/data-sets/'  #C:\Users\Afsar\Desktop\pairpred\sequence only\DBD3LOOCVSEQ
    fsp = glob.glob(loopath + '*.preds')
    dA = {}
    dsA = {}
    #daoo={}
    for i, ifile in enumerate(fsp):
        cid = getFileParts(ifile)[1]
        print 'cid =', cid, 100 * float(i + 1) / len(fsp), '% done'
        (Ml, Mv) = parseShandarFiles(cid, loopath)
        #(auc,Mv,Ml,lseq,rseq,lrV,rrV)=readFile(ifile,usePDBidx=False);#(auc,Mv,Ml,lseq,rseq,lrV,rrV)

        #(la,lv,ll)=getAUC4Protein(lrV)
        #(ra,rv,rl)=getAUC4Protein(rrV)
        Mvx = Mv.ravel()
        Mlx = Ml.ravel()
        nidx = ~np.isnan(Mvx) & ~np.isnan(Mlx)
        Mvx[~nidx] = -np.inf
        (ttp, fpi, dntp) = findNTPinTop(Mvx, Mlx, Mv.shape, top=500)
        Mvx = Mvx[nidx]
        Mlx = Mlx[nidx]
        pp = np.sum(Mlx == 1)  # total number of positives
        nn = len(Mlx) - pp  #total number of negatives
    return (Mlm,Mvm)
    
if __name__=='__main__':
    LV=[]   
    TPS=[]
    DNTP=[]
    #auconly=False # whether to calculate the avg. auc of the complexes or do more
    #(auc,(fp,tp),(A,Rx,D,L,cids,r,dkey))=getAUC('./Results/result_tppk.res.pkl')
    #dAo=dict(zip(cids,A)) #AUCs from the training data set (CV) only
    loopath='./Shandar/data-sets/data-sets/'#C:\Users\Afsar\Desktop\pairpred\sequence only\DBD3LOOCVSEQ
    fsp=glob.glob(loopath+'*.preds')    
    dA={}
    dsA={}
    #daoo={}
    for i,ifile in enumerate(fsp):
        cid=getFileParts(ifile)[1]
        print 'cid =',cid,100*float(i+1)/len(fsp),'% done'
        (Ml,Mv)=parseShandarFiles(cid,loopath)
        #(auc,Mv,Ml,lseq,rseq,lrV,rrV)=readFile(ifile,usePDBidx=False);#(auc,Mv,Ml,lseq,rseq,lrV,rrV)  
        
        #(la,lv,ll)=getAUC4Protein(lrV)
        #(ra,rv,rl)=getAUC4Protein(rrV)
        Mvx=Mv.ravel()
        Mlx=Ml.ravel()
        nidx=~np.isnan(Mvx) &  ~np.isnan(Mlx)
        Mvx[~nidx]=-np.inf            
        (ttp,fpi,dntp)=findNTPinTop(Mvx,Mlx,Mv.shape,top=500)
        Mvx=Mvx[nidx]
        Mlx=Mlx[nidx]
        pp=np.sum(Mlx==1) # total number of positives
        nn=len(Mlx)-pp #total number of negatives
def parseShandarFiles(ifile,
                      auconly=False,
                      **kwargs):  #(auc,Mv,Ml,lseq,rseq,lrV,rrV)
    """
    Reads shandar's output files with labels (made on the same pattern as
    analyzePredFile.readFile).

    Pairs <cid>.preds (one prediction per line) with <cid>.cont (contact
    descriptions), keeps only residue pairs whose chains belong to the
    left/right chain sets encoded in the file name (name_LCHAINS_RCHAINS),
    and builds dense prediction (Mvm, NaN = unscored) and label (Mlm)
    matrices.  Returns auc when auconly is True, else
    (auc, Mvm, Mlm, None, None, lrV, rrV) where lrV/rrV map row/column
    index to (max score, max label) over that row/column.
    """
    def parseContLine(ln):
        # ['A', '#5', 'ASN:7', 'N', '::', 'B', '#5', 'HIS:6', 'H:', '0', '53.61']
        #   0   1       2       3     4     5   6       7       8    9      10
        lns = ln.split()
        lidx = lns[0] + lns[1]  # chain id + residue number, left protein
        ridx = lns[5] + lns[6]  # chain id + residue number, right protein
        lbl = int(lns[9])       # contact label
        return (lidx, ridx, lbl)

    loopath, cid, _ = getFileParts(ifile)
    lcids = cid.split('_')[1]  # chains of the left protein
    rcids = cid.split('_')[2]  # chains of the right protein
    Mlidx = {}  # residue id -> dense row index
    Mridx = {}  # residue id -> dense column index
    Mlv = []    # collected (row, col, label, prediction) tuples
    l = 0
    r = 0
    with open(os.path.join(loopath, cid + '.preds')) as fp, open(
            os.path.join(loopath, cid + '.cont')) as fc:
        for lnp, lnc in zip(fp, fc):
            (lidx, ridx, lbl) = parseContLine(lnc)
            if lidx[0] in lcids and ridx[0] in rcids:
                # Assign dense indices on first sight of each residue.
                # BUG FIX: replaced the bare excepts around the dict
                # lookups, which would also have swallowed unrelated errors.
                if lidx in Mlidx:
                    lx = Mlidx[lidx]
                else:
                    Mlidx[lidx] = lx = l
                    l = l + 1
                if ridx in Mridx:
                    rx = Mridx[ridx]
                else:
                    Mridx[ridx] = rx = r
                    r = r + 1
                p = float(lnp)
                Mlv.append((lx, rx, lbl, p))
    Mvm = np.zeros((l, r))
    Mvm.fill(np.nan)  # NaN marks pairs with no prediction
    Mlm = np.zeros((l, r))
    for (lx, rx, lbl, p) in Mlv:
        Mlm[lx, rx] = lbl
        Mvm[lx, rx] = p

    (_, _, auc) = roc.roc(list(Mvm.flatten()), list(Mlm.flatten()))
    if auconly:
        return auc
    # construct lrV, rrV: per-residue (max score, max label) summaries
    lrV = dict(
        zip(range(Mvm.shape[0]), zip(np.max(Mvm, axis=1), np.max(Mlm,
                                                                 axis=1))))
    rrV = dict(
        zip(range(Mvm.shape[1]), zip(np.max(Mvm, axis=0), np.max(Mlm,
                                                                 axis=0))))

    return auc, Mvm, Mlm, None, None, lrV, rrV