def plotROC(self, filename=None, fold=None, **args):
    """Compute an ROC curve from the stored results and plot it.

    Three cases are handled:

    * ``self.numFolds == 1``: a single curve is computed from all decision
      values and labels.
    * ``fold is None`` (and more than one fold): the per-fold decision values
      and labels are paired up and passed to ``roc_module.roc_VA`` to obtain
      one combined curve.
    * otherwise: the curve is computed for the requested fold only.

    Parameters
    ----------
    filename : optional
        Passed through unchanged to ``roc_module.plotROC``.
    fold : optional
        Fold to plot; ``None`` requests the combined curve.
    **args
        Only ``rocN`` is read here; it is forwarded to the ROC computation
        (``None`` when absent).

    Raises
    ------
    ValueError
        If ``fold`` is greater than ``self.numFolds``.
    """
    rocN = args.get('rocN')

    if self.numFolds == 1:
        # the results are for a single split
        labels = self.getGivenClass()
        dvals = self.getDecisionFunction()
        rocFP, rocTP, _ = roc_module.roc(dvals, labels, rocN)
    elif fold is None:
        # one combined ROC curve over all folds
        labels = self.getGivenClass()
        dvals = self.getDecisionFunction()
        # Index-based pairing is kept on purpose: it raises IndexError if
        # there are fewer decision-value folds than label folds.
        folds = [(dvals[i], labels[i]) for i in range(len(labels))]
        rocFP, rocTP, _ = roc_module.roc_VA(folds, rocN)
    else:
        # ROC curve for the given fold only
        # NOTE(review): if folds are 0-indexed, fold == numFolds is also out
        # of range and this should be ">=" - confirm against getGivenClass.
        if fold > self.numFolds:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError('foldNum too large')
        labels = self.getGivenClass(fold)
        dvals = self.getDecisionFunction(fold)
        rocFP, rocTP, _ = roc_module.roc(dvals, labels, rocN)

    roc_module.plotROC(rocFP, rocTP, filename)
Beispiel #2
0
def getAUC(s):
    """Compute per-complex AUCs and a combined ROC curve from a results object.

    Parameters
    ----------
    s : str or tuple
        Either the path of a pickle file holding ``(r, dkey)`` or the
        ``(r, dkey)`` pair itself. ``r`` must provide ``getPatternID``,
        ``getDecisionFunction`` and ``getGivenLabels``; ``dkey`` maps a
        complex id to the pattern ids belonging to it (or to a tuple whose
        first element is that collection, for older results objects).

    Returns
    -------
    tuple
        ``(auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))`` where ``auc``,
        ``fp`` and ``tp`` come from ``roc.roc_VA`` over all complexes,
        ``A[i]`` is the third value returned by ``roc.roc`` for complex
        ``cids[i]``, ``D[i]``/``L[i]`` are its decision values and labels,
        and ``Rx[i]`` is the entry of the RMSD dictionary for that complex
        (left as ``[]`` when the dictionary could not be loaded).

    Raises
    ------
    KeyError
        If a pattern id listed in ``dkey`` is missing from the results
        (raised after the debugger hook returns).
    """
    if isinstance(s, str):
        # NOTE(security): unpickling runs arbitrary code; only load result
        # files from a trusted source.
        with open(s, "rb") as fh:  # close the handle deterministically
            (r, dkey) = cPickle.load(fh)
    else:
        (r, dkey) = s

    patid = combineList(r.getPatternID())
    # pattern id -> position in the flattened result lists
    vkey = dict(zip(patid, range(len(patid))))
    decfn = combineList(r.getDecisionFunction())
    lblid = combineList(r.getGivenLabels())
    cids = list(dkey.keys())

    D = [[] for _ in cids]
    L = [[] for _ in cids]
    A = [[] for _ in cids]
    Rx = [[] for _ in cids]

    # The RMSD table is optional: fall back to None if it cannot be read,
    # but do not swallow KeyboardInterrupt/SystemExit.
    try:
        R = getRMSDDict('shandar_rmsd.txt')
    except Exception:
        R = None

    for i, cid in enumerate(cids):
        cidx = dkey[cid]
        if type(cidx) is tuple:  # backward compatibility with old results objects
            cidx = cidx[0]
        for e in cidx:
            try:
                n = vkey[e]
            except KeyError:
                pdb.set_trace()
                # Do not fall through: `n` would be unbound or stale here
                # and would silently misalign scores and labels.
                raise
            D[i].append(decfn[n])
            L[i].append(lblid[n])
        (_, _, a) = roc.roc(D[i], L[i])
        A[i] = a
        if R is not None:
            Rx[i] = R[cid]

    # Materialize the pairs so roc_VA gets a real list on any Python version.
    (fp, tp, auc) = roc.roc_VA(list(zip(D, L)))
    return (auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))
    def plotROC(self, filename=None, fold=None, **args):
        """Compute an ROC curve from the stored results and plot it.

        With a single fold, one curve is computed from all decision values
        and labels. With several folds and ``fold=None``, the per-fold
        decision values and labels are paired and passed to
        ``roc_module.roc_VA`` to obtain one combined curve. Otherwise the
        curve of the requested fold is computed.

        Parameters
        ----------
        filename : optional
            Passed through unchanged to ``roc_module.plotROC``.
        fold : optional
            Fold to plot; ``None`` requests the combined curve.
        **args
            ``rocN`` (if present) is forwarded to the ROC computation; the
            full keyword set is also forwarded to ``roc_module.plotROC``.

        Raises
        ------
        ValueError
            If ``fold`` is greater than ``self.numFolds``.
        """
        rocN = args.get('rocN')

        if self.numFolds == 1:
            # the results are for a single split
            labels = self.getGivenClass()
            dvals = self.getDecisionFunction()
            rocFP, rocTP, _ = roc_module.roc(dvals, labels, rocN)
        elif fold is None:
            # one combined ROC curve over all folds
            labels = self.getGivenClass()
            dvals = self.getDecisionFunction()
            # Index-based pairing is kept on purpose: it raises IndexError
            # if there are fewer decision-value folds than label folds.
            folds = [(dvals[i], labels[i]) for i in range(len(labels))]
            rocFP, rocTP, _ = roc_module.roc_VA(folds, rocN)
        else:
            # ROC curve for the given fold only
            # NOTE(review): if folds are 0-indexed, fold == numFolds is also
            # out of range and this should be ">=" - confirm.
            if fold > self.numFolds:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise ValueError('foldNum too large')
            labels = self.getGivenClass(fold)
            dvals = self.getDecisionFunction(fold)
            rocFP, rocTP, _ = roc_module.roc(dvals, labels, rocN)

        roc_module.plotROC(rocFP, rocTP, filename, **args)
Beispiel #4
0
def getAUC(s):
    """Compute per-complex AUCs and a combined ROC curve from a results object.

    Parameters
    ----------
    s : str or tuple
        Path of a pickle file containing ``(r, dkey)``, or the ``(r, dkey)``
        pair itself. ``r`` must provide ``getPatternID``,
        ``getDecisionFunction`` and ``getGivenLabels``; ``dkey`` maps each
        complex id to its pattern ids (or to a tuple whose first element is
        that collection, for older results objects).

    Returns
    -------
    tuple
        ``(auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))``: the combined
        curve and area from ``roc.roc_VA``, followed by the per-complex
        areas ``A``, RMSD entries ``Rx`` (``[]`` when no RMSD dictionary is
        available), decision values ``D``, labels ``L``, the complex ids and
        the two input objects.

    Raises
    ------
    KeyError
        If a pattern id listed in ``dkey`` is missing from the results
        (raised after the debugger hook returns).
    """
    if isinstance(s, str):
        # NOTE(security): unpickling runs arbitrary code; only load result
        # files from a trusted source.
        with open(s, "rb") as fh:  # make sure the file handle is released
            (r, dkey) = cPickle.load(fh)
    else:
        (r, dkey) = s

    patid = combineList(r.getPatternID())
    # pattern id -> position in the flattened result lists
    vkey = dict(zip(patid, range(len(patid))))
    decfn = combineList(r.getDecisionFunction())
    lblid = combineList(r.getGivenLabels())
    cids = list(dkey.keys())

    D = [[] for _ in cids]
    L = [[] for _ in cids]
    A = [[] for _ in cids]
    Rx = [[] for _ in cids]

    # The RMSD table is optional; a narrower clause than a bare `except:`
    # keeps Ctrl-C and interpreter exit working.
    try:
        R = getRMSDDict('shandar_rmsd.txt')
    except Exception:
        R = None

    for i, cid in enumerate(cids):
        cidx = dkey[cid]
        if type(cidx) is tuple:  # backward compatibility with old results objects
            cidx = cidx[0]
        for e in cidx:
            try:
                n = vkey[e]
            except KeyError:
                pdb.set_trace()
                # Re-raise: continuing would use an unbound or stale `n`
                # and append the wrong score/label pair.
                raise
            D[i].append(decfn[n])
            L[i].append(lblid[n])
        (_, _, a) = roc.roc(D[i], L[i])
        A[i] = a
        if R is not None:
            Rx[i] = R[cid]

    # Materialize the pairs so roc_VA gets a real list on any Python version.
    (fp, tp, auc) = roc.roc_VA(list(zip(D, L)))
    return (auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))
 if(myid==0):
     dsAr=[dsA]
     for p in range(1,nprocs):
         (DNTP_p,dsA_p,LV_p,LVP_p,TPS_p)=comm.recv(source=p)
         dsAr.append(dsA_p)
         DNTP.extend(DNTP_p)
         LV.extend(LV_p)
         LVP.extend(LVP_p)
         TPS.extend(TPS_p)
     dsA=mergeDicts(dsAr)
     print 'Number of complexes',len(dsA)
     #print 'Complex wise AUC = ',np.mean(dA.values())
     p12=map(list,zip(*dsA.values()));pa=p12[0];p1=p12[1];p2=p12[2];ps=p1;ps.extend(p2);
     print 'Complex Wise AUC =',np.mean(pa),'Protein Wise AUC =',np.mean(ps)  
     if not auconly:
         (fplv,tplv,auclv)=roc.roc_VA(LV) 
         (fplvp,tplvp,auclvp)=roc.roc_VA(LVP) 
         mkl.save(ofname,((fplv,tplv,auclv),(fplvp,tplvp,auclvp)))
         print "AUC = ",auclv
         """        
             plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show()
             [np.sum(dn<2.0) for dn in DNTP]
             cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs]
             [dsA[cid] for cid in cids]
             [dAo[cid] for cid in cids]
         """ 
         #DISTRIBUTION PLOT
         dthresh=[0,1,2,3,4] # sequence distance threshold    
         XX=calcRFPP(np.array(TPS)[:,1]+1,DNTP,dthresh=dthresh)
         if doplot:
             plt.figure();plt.plot(fplv,tplv);plt.xlabel('FP');plt.ylabel('TP');plt.grid();plt.title('ROC Curve: AUC =  %1.2f' % (auclv*100))
Beispiel #6
0
    for i in range(len(L.R)):
        if u2b[i] is not np.nan:
            rasa[i]=xasa[u2b[i]]
    return (lasa,rasa)
if __name__=="__main__":
    # Script entry point: load a pickled result set, derive one feature per
    # example from the change in rASA between the unbound and bound
    # structures of each complex, compute an ROC over those features, and
    # pass features, decision values and labels to rasaPlot.
    fname = '../Results/result_tppk.res.pkl'
    pdbpklpath = '../DBD4N/PDBPKL4'
    (auc, (fp, tp), (A, Rx, Dx, Lx, cids, r, dkey)) = getAUC(fname)

    F = [[] for c in cids]
    for i, cid in enumerate(cids):
        # Single-argument call form prints the same text on Python 2 and 3.
        print('Processing %s' % (cid,))
        L = myPDB.loader(os.path.join(pdbpklpath, cid + '_l_u.pdb.pkl'))
        R = myPDB.loader(os.path.join(pdbpklpath, cid + '_r_u.pdb.pkl'))
        Lb = myPDB.loader(os.path.join(pdbpklpath, cid + '_l_b.pdb.pkl'))
        Rb = myPDB.loader(os.path.join(pdbpklpath, cid + '_r_b.pdb.pkl'))
        lurasa, lbrasa = getDASA(L, Lb)
        rurasa, rbrasa = getDASA(R, Rb)
        ldasa = np.abs(lurasa - lbrasa)
        rdasa = np.abs(rurasa - rbrasa)
        # dkey[cid][1] is iterated as (_, (lidx, ridx)) pairs; the feature is
        # the sum of the two per-index |delta rASA| values.
        # Alternative features kept from the original for reference:
        #   L.rASA[lidx]+R.rASA[ridx]
        #   len(L.S[0][lidx])+len(R.S[0][ridx])
        #   L.psaiaf['rhph'][lidx]+R.psaiaf['rhph'][ridx]
        #   -(L.RD[0,lidx]+L.RD[1,lidx]+R.RD[0,ridx]+R.RD[1,ridx])
        for (_, (lidx, ridx)) in dkey[cid][1]:
            f = ldasa[lidx] + rdasa[ridx]
            F[i].append(f)

    # Materialize the pairs so roc_VA gets a real list on any Python version.
    (fp, tp, auc) = roc.roc_VA(list(zip(F, Lx)))
    #plt.plot(fp,tp);plt.xlabel('FPR');plt.ylabel('TPR');plt.title('ROC for $\Delta$rASA. AUC = '+str(auc));plt.ylim([0,1]);plt.show()
    Fx = list(itertools.chain(*F))
    Dxx = list(itertools.chain(*Dx))
    Lxx = list(itertools.chain(*Lx))
    # Drop examples whose feature is NaN before plotting.
    nidx = ~np.isnan(Fx)
    rasaPlot(np.array(Fx)[nidx], np.array(Dxx)[nidx], np.array(Lxx)[nidx], Np=10)
Beispiel #7
0
         #aucoo=readFile('./DBD3LOOCV/'+getFileParts(ifile)[1]+getFileParts(ifile)[2],auconly=True)
         #daoo[cid]=aucoo
     else:
         (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx)=computeNTP(ifile,top=200)            
         TPS.append([ttp,fpi,100.0*ttp/pp,pp,nn,pp+nn])
         DNTP.append(dntp)
         LV.append((list(Mvx),list(Mlx)))       
         dsA[cid]=(auc,la,ra)            
     dA[cid]=auc
     #print cid,auc,dAo[cid]
 print 'Number of complexes',len(dA)
 print 'Complex wise AUC = ',np.mean(dA.values()),'AUC for reduced set = ',np.mean([dAo[k] for k in dA.keys() if k in dAo.keys()])
 if not auconly:
     p12=map(list,zip(*dsA.values()));pa=p12[0];p1=p12[1];p2=p12[2];ps=p1;ps.extend(p2);
     print 'Complex Wise AUC =',np.mean(pa),'Protein Wise AUC =',np.mean(ps)  
     #ROC CURVE
     (fplv,tplv,auclv)=roc.roc_VA(LV)
     plt.figure();plt.plot(fplv,tplv);plt.xlabel('FP');plt.ylabel('TP');plt.grid();plt.title('ROC Curve: AUC =  %1.2f' % (auclv*100))
     """        
         plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show()
         [np.sum(dn<2.0) for dn in DNTP]
         cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs]
         [dsA[cid] for cid in cids]
         [dAo[cid] for cid in cids]
     """ 
     #DISTRIBUTION PLOT
     dthresh=[0,1,2,3,4] # sequence distance threshold    
     XX=calcRFPP(np.array(TPS)[:,1]+1,DNTP,dthresh=dthresh)
     plt.figure();plt.boxplot(tuple(XX),bootstrap=1000,positions=dthresh);plt.xlabel('Sequence Distance (D) from a TP'); plt.ylabel('Minimum rank of a prediction within distance D of a TP' );plt.title('Results of soft sequence distance threshold');plt.grid();plt.yticks(range(0,201,10));
     plt.show() 
    
Beispiel #8
0
     LV.extend(LV_p)
     LVP.extend(LVP_p)
     TPS.extend(TPS_p)
 dsA = mergeDicts(dsAr)
 print 'Number of complexes', len(dsA)
 #print 'Complex wise AUC = ',np.mean(dA.values())
 p12 = map(list, zip(*dsA.values()))
 pa = p12[0]
 p1 = p12[1]
 p2 = p12[2]
 ps = p1
 ps.extend(p2)
 print 'Complex Wise AUC =', np.mean(pa), 'Protein Wise AUC =', np.mean(
     ps)
 if not auconly:
     (fplv, tplv, auclv) = roc.roc_VA(LV)
     (fplvp, tplvp, auclvp) = roc.roc_VA(LVP)
     mkl.save(ofname, ((fplv, tplv, auclv), (fplvp, tplvp, auclvp)))
     print "AUC = ", auclv
     """        
         plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show()
         [np.sum(dn<2.0) for dn in DNTP]
         cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs]
         [dsA[cid] for cid in cids]
         [dAo[cid] for cid in cids]
     """
     #DISTRIBUTION PLOT
     dthresh = [0, 1, 2, 3, 4]  # sequence distance threshold
     XX = calcRFPP(np.array(TPS)[:, 1] + 1, DNTP, dthresh=dthresh)
     if doplot:
         plt.figure()
Beispiel #9
0
     #print cid,auc,dAo[cid]
 print 'Number of complexes', len(dA)
 print 'Complex wise AUC = ', np.mean(
     dA.values()), 'AUC for reduced set = ', np.mean(
         [dAo[k] for k in dA.keys() if k in dAo.keys()])
 if not auconly:
     p12 = map(list, zip(*dsA.values()))
     pa = p12[0]
     p1 = p12[1]
     p2 = p12[2]
     ps = p1
     ps.extend(p2)
     print 'Complex Wise AUC =', np.mean(pa), 'Protein Wise AUC =', np.mean(
         ps)
     #ROC CURVE
     (fplv, tplv, auclv) = roc.roc_VA(LV)
     plt.figure()
     plt.plot(fplv, tplv)
     plt.xlabel('FP')
     plt.ylabel('TP')
     plt.grid()
     plt.title('ROC Curve: AUC =  %1.2f' % (auclv * 100))
     """        
         plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show()
         [np.sum(dn<2.0) for dn in DNTP]
         cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs]
         [dsA[cid] for cid in cids]
         [dAo[cid] for cid in cids]
     """
     #DISTRIBUTION PLOT
     dthresh = [0, 1, 2, 3, 4]  # sequence distance threshold
            folds = []
            s = svm.SVM(C=C)
            s.train(trainData)

            # testData = SparseDataSet(testFeatureFile);
            testData = demo_utils.get_spectrum_data(testSeqFile, k1, k2, testLen, testLen, True)

            results = s.test(testData)
            labels = results.getGivenClass()
            dvals = results.getDecisionFunction()
            folds.append((dvals, labels))

            demo_utils.print_results(results)
            print "Results Log: "
            results.getLog()
            fpc, tpc, area = roc_mod.roc_VA(folds, None)
            print "Area: " + str(area)
            if area > bestAUC:
                bestAUC = area
                bestFP = fpc
                bestTP = tpc
                bestC = C
                ofile = open("roc%s.txt" % (str(C)), "w")
                ofile.write("area: " + str(area) + "\n")
                ofile.write("bestFP: " + str(bestFP) + "\n")
                ofile.write("bestTP: " + str(bestTP) + "\n")
                ofile.write("bestC: " + str(bestC) + "\n")
                ofile.close()
                print (
                    "area: %s, bestFP: %s, bestTP: %s, bestC: %s" % (str(area), str(bestFP), str(bestTP), str(bestC))
                )