def plotROC(self, filename=None, fold=None, **args):
    """Compute an ROC curve for these results and hand it to the plotter.

    Which curve is computed depends on the state of the results object:

    * ``self.numFolds == 1``: one curve from all labels and decision
      values via ``roc_module.roc`` (``fold`` is ignored).
    * ``fold is None``: a curve over all folds via ``roc_module.roc_VA``,
      fed a list of ``(decision values, labels)`` pairs, one per fold.
    * otherwise: the curve of the requested fold via ``roc_module.roc``.

    Parameters:
        filename: passed unchanged to ``roc_module.plotROC``.
        fold: fold to plot, or None for the curve over all folds.
        **args: only the optional ``rocN`` entry is read here; it is
            forwarded to ``roc_module.roc`` / ``roc_module.roc_VA``.

    Raises:
        ValueError: if ``fold`` is greater than ``self.numFolds``.
    """
    # A missing 'rocN' is passed on as None, exactly as before.
    rocN = args.get('rocN')

    if self.numFolds == 1:
        # the results are for a single split
        labels = self.getGivenClass()
        dvals = self.getDecisionFunction()
        rocFP, rocTP, area = roc_module.roc(dvals, labels, rocN)
    elif fold is None:
        # get an averaged ROC curve: pair up each fold's decision values
        # with its labels
        labels = self.getGivenClass()
        dvals = self.getDecisionFunction()
        folds = [(dvals[i], foldLabels)
                 for i, foldLabels in enumerate(labels)]
        rocFP, rocTP, area = roc_module.roc_VA(folds, rocN)
    else:
        # plot an ROC plot for the given fold
        # NOTE(review): fold == self.numFolds passes this check; if folds
        # are 0-indexed that is one past the last fold -- confirm.
        if fold > self.numFolds:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError('foldNum too large')
        labels = self.getGivenClass(fold)
        dvals = self.getDecisionFunction(fold)
        rocFP, rocTP, area = roc_module.roc(dvals, labels, rocN)

    roc_module.plotROC(rocFP, rocTP, filename)
def getAUC(s):
    """Compute per-complex ROC areas and a combined ROC curve.

    Parameters:
        s: either a ``(r, dkey)`` pair or the path of a pickle file that
           holds such a pair.  ``r`` is a results object providing
           ``getPatternID``, ``getDecisionFunction`` and ``getGivenLabels``;
           ``dkey`` maps a complex id to the pattern ids of its examples
           (or to a tuple whose first item is that collection).

    Returns:
        ``(auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))`` where ``fp``,
        ``tp`` and ``auc`` come from ``roc.roc_VA`` over all complexes,
        ``A[i]`` is the area returned by ``roc.roc`` for complex
        ``cids[i]``, ``D[i]`` / ``L[i]`` are its decision values and
        labels, and ``Rx[i]`` is its entry from the RMSD dictionary (an
        empty list when that dictionary could not be loaded).

    Raises:
        KeyError: if an example listed in ``dkey`` has no matching
            pattern id in ``r``.
    """
    if isinstance(s, str):
        # NOTE: unpickling can execute arbitrary code; only load result
        # files from a trusted source.
        with open(s, "rb") as fh:  # close the handle deterministically
            (r, dkey) = cPickle.load(fh)
    else:
        (r, dkey) = s

    patid = combineList(r.getPatternID())
    # pattern id -> position in the flattened result lists
    vkey = {pid: pos for pos, pid in enumerate(patid)}
    decfn = combineList(r.getDecisionFunction())
    lblid = combineList(r.getGivenLabels())

    cids = list(dkey.keys())
    D = [[] for _ in cids]
    L = [[] for _ in cids]
    A = [[] for _ in cids]
    Rx = [[] for _ in cids]

    # The RMSD table is optional: carry on without it when it cannot be
    # loaded, but let KeyboardInterrupt / SystemExit propagate.
    try:
        R = getRMSDDict('shandar_rmsd.txt')
    except Exception:
        R = None

    for i, cid in enumerate(cids):
        cidx = dkey[cid]
        if isinstance(cidx, tuple):
            # backward compatibility with old results objects
            cidx = cidx[0]
        for e in cidx:
            try:
                n = vkey[e]
            except KeyError:
                # Previously this dropped into pdb and then carried on
                # with a stale (or unbound) index; fail loudly instead.
                raise KeyError(
                    'example %r of complex %r has no matching pattern id'
                    % (e, cid))
            D[i].append(decfn[n])
            L[i].append(lblid[n])
        (_, _, a) = roc.roc(D[i], L[i])
        A[i] = a
        if R is not None:
            Rx[i] = R[cid]

    (fp, tp, auc) = roc.roc_VA(list(zip(D, L)))
    return (auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))
def plotROC(self, filename=None, fold=None, **args):
    """Compute an ROC curve for these results and hand it to the plotter.

    Which curve is computed depends on the state of the results object:

    * ``self.numFolds == 1``: one curve from all labels and decision
      values via ``roc_module.roc`` (``fold`` is ignored).
    * ``fold is None``: a curve over all folds via ``roc_module.roc_VA``,
      fed a list of ``(decision values, labels)`` pairs, one per fold.
    * otherwise: the curve of the requested fold via ``roc_module.roc``.

    Parameters:
        filename: passed unchanged to ``roc_module.plotROC``.
        fold: fold to plot, or None for the curve over all folds.
        **args: the optional ``rocN`` entry is forwarded to
            ``roc_module.roc`` / ``roc_module.roc_VA``; the complete
            keyword set (``rocN`` included, when given) is also forwarded
            to ``roc_module.plotROC``.

    Raises:
        ValueError: if ``fold`` is greater than ``self.numFolds``.
    """
    # A missing 'rocN' is passed on as None, exactly as before.
    rocN = args.get('rocN')

    if self.numFolds == 1:
        # the results are for a single split
        labels = self.getGivenClass()
        dvals = self.getDecisionFunction()
        rocFP, rocTP, area = roc_module.roc(dvals, labels, rocN)
    elif fold is None:
        # get an averaged ROC curve: pair up each fold's decision values
        # with its labels
        labels = self.getGivenClass()
        dvals = self.getDecisionFunction()
        folds = [(dvals[i], foldLabels)
                 for i, foldLabels in enumerate(labels)]
        rocFP, rocTP, area = roc_module.roc_VA(folds, rocN)
    else:
        # plot an ROC plot for the given fold
        # NOTE(review): fold == self.numFolds passes this check; if folds
        # are 0-indexed that is one past the last fold -- confirm.
        if fold > self.numFolds:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise ValueError('foldNum too large')
        labels = self.getGivenClass(fold)
        dvals = self.getDecisionFunction(fold)
        rocFP, rocTP, area = roc_module.roc(dvals, labels, rocN)

    roc_module.plotROC(rocFP, rocTP, filename, **args)
def getAUC(s):
    """Compute per-complex ROC areas and a combined ROC curve.

    Parameters:
        s: either a ``(r, dkey)`` pair or the path of a pickle file that
           holds such a pair.  ``r`` is a results object providing
           ``getPatternID``, ``getDecisionFunction`` and ``getGivenLabels``;
           ``dkey`` maps a complex id to the pattern ids of its examples
           (or to a tuple whose first item is that collection).

    Returns:
        ``(auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))`` where ``fp``,
        ``tp`` and ``auc`` come from ``roc.roc_VA`` over all complexes,
        ``A[i]`` is the area returned by ``roc.roc`` for complex
        ``cids[i]``, ``D[i]`` / ``L[i]`` are its decision values and
        labels, and ``Rx[i]`` is its entry from the RMSD dictionary (an
        empty list when that dictionary could not be loaded).

    Raises:
        KeyError: if an example listed in ``dkey`` has no matching
            pattern id in ``r``.
    """
    if isinstance(s, str):
        # NOTE: unpickling can execute arbitrary code; only load result
        # files from a trusted source.
        with open(s, "rb") as fh:  # close the handle deterministically
            (r, dkey) = cPickle.load(fh)
    else:
        (r, dkey) = s

    patid = combineList(r.getPatternID())
    # pattern id -> position in the flattened result lists
    vkey = {pid: pos for pos, pid in enumerate(patid)}
    decfn = combineList(r.getDecisionFunction())
    lblid = combineList(r.getGivenLabels())

    cids = list(dkey.keys())
    D = [[] for _ in cids]
    L = [[] for _ in cids]
    A = [[] for _ in cids]
    Rx = [[] for _ in cids]

    # The RMSD table is optional: carry on without it when it cannot be
    # loaded, but let KeyboardInterrupt / SystemExit propagate.
    try:
        R = getRMSDDict('shandar_rmsd.txt')
    except Exception:
        R = None

    for i, cid in enumerate(cids):
        cidx = dkey[cid]
        if isinstance(cidx, tuple):
            # backward compatibility with old results objects
            cidx = cidx[0]
        for e in cidx:
            try:
                n = vkey[e]
            except KeyError:
                # Previously this dropped into pdb and then carried on
                # with a stale (or unbound) index; fail loudly instead.
                raise KeyError(
                    'example %r of complex %r has no matching pattern id'
                    % (e, cid))
            D[i].append(decfn[n])
            L[i].append(lblid[n])
        (_, _, a) = roc.roc(D[i], L[i])
        A[i] = a
        if R is not None:
            Rx[i] = R[cid]

    (fp, tp, auc) = roc.roc_VA(list(zip(D, L)))
    return (auc, (fp, tp), (A, Rx, D, L, cids, r, dkey))
if(myid==0): dsAr=[dsA] for p in range(1,nprocs): (DNTP_p,dsA_p,LV_p,LVP_p,TPS_p)=comm.recv(source=p) dsAr.append(dsA_p) DNTP.extend(DNTP_p) LV.extend(LV_p) LVP.extend(LVP_p) TPS.extend(TPS_p) dsA=mergeDicts(dsAr) print 'Number of complexes',len(dsA) #print 'Complex wise AUC = ',np.mean(dA.values()) p12=map(list,zip(*dsA.values()));pa=p12[0];p1=p12[1];p2=p12[2];ps=p1;ps.extend(p2); print 'Complex Wise AUC =',np.mean(pa),'Protein Wise AUC =',np.mean(ps) if not auconly: (fplv,tplv,auclv)=roc.roc_VA(LV) (fplvp,tplvp,auclvp)=roc.roc_VA(LVP) mkl.save(ofname,((fplv,tplv,auclv),(fplvp,tplvp,auclvp))) print "AUC = ",auclv """ plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show() [np.sum(dn<2.0) for dn in DNTP] cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs] [dsA[cid] for cid in cids] [dAo[cid] for cid in cids] """ #DISTRIBUTION PLOT dthresh=[0,1,2,3,4] # sequence distance threshold XX=calcRFPP(np.array(TPS)[:,1]+1,DNTP,dthresh=dthresh) if doplot: plt.figure();plt.plot(fplv,tplv);plt.xlabel('FP');plt.ylabel('TP');plt.grid();plt.title('ROC Curve: AUC = %1.2f' % (auclv*100))
for i in range(len(L.R)): if u2b[i] is not np.nan: rasa[i]=xasa[u2b[i]] return (lasa,rasa) if __name__=="__main__": fname='../Results/result_tppk.res.pkl' pdbpklpath='../DBD4N/PDBPKL4' (auc,(fp,tp),(A,Rx,Dx,Lx,cids,r,dkey))=getAUC(fname) cids=cids F=[[] for c in cids] for i,cid in enumerate(cids): print 'Processing',cid L=myPDB.loader(os.path.join(pdbpklpath,cid+'_l_u.pdb.pkl')) R=myPDB.loader(os.path.join(pdbpklpath,cid+'_r_u.pdb.pkl')) Lb=myPDB.loader(os.path.join(pdbpklpath,cid+'_l_b.pdb.pkl')) Rb=myPDB.loader(os.path.join(pdbpklpath,cid+'_r_b.pdb.pkl')) lurasa,lbrasa=getDASA(L,Lb) rurasa,rbrasa=getDASA(R,Rb) ldasa=np.abs(lurasa-lbrasa) rdasa=np.abs(rurasa-rbrasa) for (_,(lidx,ridx)) in dkey[cid][1]: f=ldasa[lidx]+rdasa[ridx]#L.rASA[lidx]+R.rASA[ridx]#len(L.S[0][lidx])+len(R.S[0][ridx])#L.psaiaf['rhph'][lidx]+R.psaiaf['rhph'][ridx]#-(L.RD[0,lidx]+L.RD[1,lidx]+R.RD[0,ridx]+R.RD[1,ridx]) F[i].append(f) (fp,tp,auc)=roc.roc_VA(zip(F,Lx)) #plt.plot(fp,tp);plt.xlabel('FPR');plt.ylabel('TPR');plt.title('ROC for $\Delta$rASA. AUC = '+str(auc));plt.ylim([0,1]);plt.show() Fx=list(itertools.chain(*F)) Dxx=list(itertools.chain(*Dx)) Lxx=list(itertools.chain(*Lx)) nidx= ~np.isnan(Fx) rasaPlot(np.array(Fx)[nidx],np.array(Dxx)[nidx],np.array(Lxx)[nidx],Np=10)
#aucoo=readFile('./DBD3LOOCV/'+getFileParts(ifile)[1]+getFileParts(ifile)[2],auconly=True) #daoo[cid]=aucoo else: (auc,ttp,fpi,dntp,la,ra,pp,nn,Mvx,Mlx)=computeNTP(ifile,top=200) TPS.append([ttp,fpi,100.0*ttp/pp,pp,nn,pp+nn]) DNTP.append(dntp) LV.append((list(Mvx),list(Mlx))) dsA[cid]=(auc,la,ra) dA[cid]=auc #print cid,auc,dAo[cid] print 'Number of complexes',len(dA) print 'Complex wise AUC = ',np.mean(dA.values()),'AUC for reduced set = ',np.mean([dAo[k] for k in dA.keys() if k in dAo.keys()]) if not auconly: p12=map(list,zip(*dsA.values()));pa=p12[0];p1=p12[1];p2=p12[2];ps=p1;ps.extend(p2); print 'Complex Wise AUC =',np.mean(pa),'Protein Wise AUC =',np.mean(ps) #ROC CURVE (fplv,tplv,auclv)=roc.roc_VA(LV) plt.figure();plt.plot(fplv,tplv);plt.xlabel('FP');plt.ylabel('TP');plt.grid();plt.title('ROC Curve: AUC = %1.2f' % (auclv*100)) """ plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show() [np.sum(dn<2.0) for dn in DNTP] cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs] [dsA[cid] for cid in cids] [dAo[cid] for cid in cids] """ #DISTRIBUTION PLOT dthresh=[0,1,2,3,4] # sequence distance threshold XX=calcRFPP(np.array(TPS)[:,1]+1,DNTP,dthresh=dthresh) plt.figure();plt.boxplot(tuple(XX),bootstrap=1000,positions=dthresh);plt.xlabel('Sequence Distance (D) from a TP'); plt.ylabel('Minimum rank of a prediction within distance D of a TP' );plt.title('Results of soft sequence distance threshold');plt.grid();plt.yticks(range(0,201,10)); plt.show()
LV.extend(LV_p) LVP.extend(LVP_p) TPS.extend(TPS_p) dsA = mergeDicts(dsAr) print 'Number of complexes', len(dsA) #print 'Complex wise AUC = ',np.mean(dA.values()) p12 = map(list, zip(*dsA.values())) pa = p12[0] p1 = p12[1] p2 = p12[2] ps = p1 ps.extend(p2) print 'Complex Wise AUC =', np.mean(pa), 'Protein Wise AUC =', np.mean( ps) if not auconly: (fplv, tplv, auclv) = roc.roc_VA(LV) (fplvp, tplvp, auclvp) = roc.roc_VA(LVP) mkl.save(ofname, ((fplv, tplv, auclv), (fplvp, tplvp, auclvp))) print "AUC = ", auclv """ plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show() [np.sum(dn<2.0) for dn in DNTP] cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs] [dsA[cid] for cid in cids] [dAo[cid] for cid in cids] """ #DISTRIBUTION PLOT dthresh = [0, 1, 2, 3, 4] # sequence distance threshold XX = calcRFPP(np.array(TPS)[:, 1] + 1, DNTP, dthresh=dthresh) if doplot: plt.figure()
#print cid,auc,dAo[cid] print 'Number of complexes', len(dA) print 'Complex wise AUC = ', np.mean( dA.values()), 'AUC for reduced set = ', np.mean( [dAo[k] for k in dA.keys() if k in dAo.keys()]) if not auconly: p12 = map(list, zip(*dsA.values())) pa = p12[0] p1 = p12[1] p2 = p12[2] ps = p1 ps.extend(p2) print 'Complex Wise AUC =', np.mean(pa), 'Protein Wise AUC =', np.mean( ps) #ROC CURVE (fplv, tplv, auclv) = roc.roc_VA(LV) plt.figure() plt.plot(fplv, tplv) plt.xlabel('FP') plt.ylabel('TP') plt.grid() plt.title('ROC Curve: AUC = %1.2f' % (auclv * 100)) """ plt.hist(np.array(DNTP).flatten(),[0,1,2,3,4,5,6,1000],cumulative=True);plt.grid();plt.xlabel('sequence distance');plt.ylabel('counts');plt.title('Number of top 200 predictions vs. sequence distance from nearest true positive');plt.show() [np.sum(dn<2.0) for dn in DNTP] cids=[getFileParts(getFileParts(ifile)[1])[1] for ifile in fs] [dsA[cid] for cid in cids] [dAo[cid] for cid in cids] """ #DISTRIBUTION PLOT dthresh = [0, 1, 2, 3, 4] # sequence distance threshold
folds = [] s = svm.SVM(C=C) s.train(trainData) # testData = SparseDataSet(testFeatureFile); testData = demo_utils.get_spectrum_data(testSeqFile, k1, k2, testLen, testLen, True) results = s.test(testData) labels = results.getGivenClass() dvals = results.getDecisionFunction() folds.append((dvals, labels)) demo_utils.print_results(results) print "Results Log: " results.getLog() fpc, tpc, area = roc_mod.roc_VA(folds, None) print "Area: " + str(area) if area > bestAUC: bestAUC = area bestFP = fpc bestTP = tpc bestC = C ofile = open("roc%s.txt" % (str(C)), "w") ofile.write("area: " + str(area) + "\n") ofile.write("bestFP: " + str(bestFP) + "\n") ofile.write("bestTP: " + str(bestTP) + "\n") ofile.write("bestC: " + str(bestC) + "\n") ofile.close() print ( "area: %s, bestFP: %s, bestTP: %s, bestC: %s" % (str(area), str(bestFP), str(bestTP), str(bestC)) )