def plotBinomials(dataMatrix,nVals,p):
    """Plot one histogram per binomial size, then a mean/variance summary.

    For every n in nVals, plotSingleHist is called with the matching row of
    dataMatrix. The three statistics it returns are collected and drawn in a
    two-panel figure that is saved as outDir + "MeanVar".

    Args:
        dataMatrix: 2-D array indexed as dataMatrix[i,:]; row i is passed to
            plotSingleHist together with nVals[i].
        nVals: array of binomial sizes (nVals.size rows are processed).
        p: binomial probability, forwarded to plotSingleHist.

    Relies on the module-level names outDir and fontsize.
    """
    nTrials = nVals.size
    # rows are the trials
    # save the means and variances...
    means = np.zeros(nTrials)
    varReal = np.zeros(nTrials)
    varDist = np.zeros(nTrials)
    for i,n in enumerate(nVals):
        means[i],varReal[i],varDist[i] = \
            plotSingleHist(n,p,dataMatrix[i,:],outDir)
    # plot the means and variances
    fig = pPlotUtil.figure()
    plt.subplot(1,2,1)
    plt.title("Mean of g(xBar)-g(mu) approaches 0",fontsize=fontsize)
    expMean = 0
    plt.plot(nVals,means,'ko',label="Actual Mean")
    plt.axhline(expMean,color='b',linestyle='--',
                label="Expected Mean: {:.2g}".format(expMean))
    # symmetric limits around the expected mean keep the reference line and
    # every point visible whatever the sign of the means (using -min(means)
    # as the lower limit inverts the axis once any mean is negative)
    bound = 1.1 * np.max(np.abs(means - expMean))
    if bound > 0:
        plt.ylim(expMean - bound,expMean + bound)
    plt.xlabel("Value of n for binomial",fontsize=fontsize)
    plt.ylabel("Value of g(xBar)-g(mu)",fontsize=fontsize)
    plt.legend(fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    plt.subplot(1,2,2)
    plt.semilogy(nVals,varReal,'ko',label="Actual Variance")
    plt.semilogy(nVals,varDist,'b--',label="Expected Variance")
    plt.title("Variance of g(xBar)-g(mu)\n approaches expected",
              fontsize=fontsize)
    plt.xlabel("Value of n for binomial",fontsize=fontsize)
    plt.ylabel("Value of g(xBar) variance",fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    plt.legend(fontsize=fontsize)
    pPlotUtil.savefig(fig,outDir + "MeanVar")
def plotAlignments(outDir,alignments,scoreOptimal,label):
    """Plot shuffled-alignment score histograms against a fitted normal.

    Two panels are drawn: the score histogram with the normal PDF built from
    the mean/stdev returned by plotScoreHistograms, and a second histogram
    with the optimal score marked and its z-score / tail probability in the
    title. The figure is saved as outDir + "q2Histograms" + label.

    Args:
        outDir: output directory prefix (string-concatenated with the name).
        alignments: iterable of alignments; each is scored by getAlignScore.
        scoreOptimal: score of the optimal (unshuffled) alignment.
        label: name of the scoring scheme, used in text and the file name.
    """
    fontSize = 25
    scores = [getAlignScore(a) for a in alignments]
    fig = pPlotUtil.figure(xSize=24,ySize=12)
    plt.subplot(1,2,1)
    meanScore,stdevScore,bins = plotScoreHistograms(scores,fontSize,'k')
    plt.title("Shuffled DNA alignment Histogram", fontsize=fontSize)
    pdfVals = norm(loc=meanScore,scale=stdevScore).pdf(bins)

    def plotPDF():
        # same curve is overlaid on both panels
        plt.plot(bins,pdfVals,'g--',linewidth=3.0,
                 label="Normal(mean,var)")

    plotPDF()
    plt.legend(fontsize=fontSize)
    plt.subplot(1,2,2)
    plotScoreHistograms(scores,fontSize)
    plotPDF()
    zScore = (scoreOptimal-meanScore)/stdevScore
    print("Z Score for {:s} is {:.2f}".format(label,zScore))
    # ??? is this the real p Value? Dont think so
    # sf (survival function) is the upper tail 1-cdf, but it does not cancel
    # to exactly 0.0 when zScore is large
    extrProb = norm().sf(zScore)
    plt.title(("Histogram of optimal alignment score for {:d} trials\n" +
               "Optimal score: {:d}*sigma from shuffled mean.\n"
               "P(shuffled score>=optimal) ~ {:.5g}").
              format(len(scores),int(zScore),extrProb),fontsize=fontSize)
    plt.axvline(scoreOptimal,color='r',linestyle='--',
                label="Optimal global alignment score using {:s}: {:d}".
                format(label,int(scoreOptimal)))
    plt.legend(loc='best',fontsize=fontSize)
    pPlotUtil.savefig(fig,outDir + "q2Histograms" + label)
def plotSingleHist(n,p,xTrials,outDir):
    """Plot the g(xBar)-g(mu) histogram for one n next to its normal model.

    The distribution and its model come from getDeltaModelDistr(n,p,xTrials);
    the histogram is drawn by normHist and the model PDF is overlaid. The
    figure is saved as outDir + "trial_n<n>".

    Args:
        n: binomial size (formatted as an integer in title and file name).
        p: binomial probability.
        xTrials: trial data forwarded to getDeltaModelDistr.
        outDir: output directory prefix.

    Returns:
        Tuple (distMean, distVar, normalStd**2).

    Relies on the module-level names g_title and g_label.
    """
    # coverage is just a plotting artifact
    fig = pPlotUtil.figure()
    # mu: expected value of Binomial(n,p)
    # effective variance
    modelOut = getDeltaModelDistr(n,p,xTrials)
    dist,distMean,distVar,normalStd,normalDist,xVals,nBins = modelOut
    histLabel = "Actual Distr: Mean={:.4f},Stdev={:.4f}".format(
        distMean,np.sqrt(distVar))
    normV = normHist(dist,nBins,label=histLabel)
    rawPDF = normalDist.pdf(xVals)
    theoryLabel = "Theorertical Distr: Stdev={:.4f}".format(normalStd)
    plt.plot(xVals,rawPDF,'r--',linewidth=5.0,label=theoryLabel)
    titleStr = "Histogram for g(xBar)-g(mu) for n={:d},p={:.2f}".format(
        int(n),p)
    plt.title(titleStr,fontsize=g_title)
    plt.xlabel("(g(Xbar)-g(mu)) ~ Normal(0,[g'(x)*sigma]^2/n)",
               fontsize=g_label)
    plt.ylabel("Proportion",fontsize=g_label)
    plt.legend(frameon=False)
    pPlotUtil.tickAxisFont()
    # y range covers both the model curve and the histogram bars
    catArr = list(rawPDF) + list(normV)
    yTop = max(catArr)*1.2
    plt.ylim([0,yTop])
    xEdge = max(nBins)
    plt.xlim([-xEdge,xEdge])
    pPlotUtil.tickAxisFont()
    saveName = outDir + "trial_n{:d}".format(int(n))
    pPlotUtil.savefig(fig,saveName)
    # return the statistics for plotting
    return distMean,distVar,normalStd**2
def plotSingleHist(n, p, xTrials, outDir):
    """Draw and save the histogram of g(xBar)-g(mu) for a single value of n.

    getDeltaModelDistr supplies the distribution, its mean and variance, the
    model standard deviation / frozen distribution, the x grid and the bins.
    The histogram (via normHist) and the model PDF share one figure, which is
    written to outDir + "trial_n<n>".

    Args:
        n: binomial size; converted with int() for the title and file name.
        p: binomial probability.
        xTrials: trial data handed to getDeltaModelDistr.
        outDir: output directory prefix.

    Returns:
        (distMean, distVar, normalStd**2), used by the caller for plotting.

    Uses the module-level font sizes g_title and g_label.
    """
    nInt = int(n)
    # coverage is just a plotting artifact
    fig = pPlotUtil.figure()
    # mu: expected value of Binomial(n,p)
    # effective variance
    (dist, distMean, distVar, normalStd,
     normalDist, xVals, nBins) = getDeltaModelDistr(n, p, xTrials)
    actualLabel = ("Actual Distr: Mean={:.4f},Stdev={:.4f}"
                   .format(distMean, np.sqrt(distVar)))
    normV = normHist(dist, nBins, label=actualLabel)
    rawPDF = normalDist.pdf(xVals)
    plt.plot(xVals, rawPDF, 'r--', linewidth=5.0,
             label="Theorertical Distr: Stdev={:.4f}".format(normalStd))
    plt.title("Histogram for g(xBar)-g(mu) for n={:d},p={:.2f}"
              .format(nInt, p),
              fontsize=g_title)
    plt.xlabel("(g(Xbar)-g(mu)) ~ Normal(0,[g'(x)*sigma]^2/n)",
               fontsize=g_label)
    plt.ylabel("Proportion", fontsize=g_label)
    plt.legend(frameon=False)
    pPlotUtil.tickAxisFont()
    # tallest of model curve and histogram sets the y range (20% headroom)
    heights = list(rawPDF) + list(normV)
    plt.ylim([0, max(heights) * 1.2])
    binMax = max(nBins)
    plt.xlim([-binMax, binMax])
    pPlotUtil.tickAxisFont()
    pPlotUtil.savefig(fig, outDir + "trial_n{:d}".format(nInt))
    # return the statistics for plotting
    varModel = normalStd**2
    return distMean, distVar, varModel
def plotBinomials(dataMatrix, nVals, p):
    """Plot per-n histograms, then summarize their means and variances.

    plotSingleHist is called once per entry of nVals with the matching row of
    dataMatrix; the returned mean, observed variance and model variance are
    plotted against n in a two-panel figure saved as outDir + "MeanVar".

    Args:
        dataMatrix: 2-D array indexed as dataMatrix[i, :]; row i goes with
            nVals[i].
        nVals: array of binomial sizes (nVals.size rows are processed).
        p: binomial probability, forwarded to plotSingleHist.

    Relies on the module-level names outDir and fontsize.
    """
    nTrials = nVals.size
    # rows are the trials
    # save the means and variances...
    means = np.zeros(nTrials)
    varReal = np.zeros(nTrials)
    varDist = np.zeros(nTrials)
    for i, n in enumerate(nVals):
        means[i], varReal[i], varDist[i] = \
            plotSingleHist(n, p, dataMatrix[i, :], outDir)
    # plot the means and variances
    fig = pPlotUtil.figure()
    plt.subplot(1, 2, 1)
    plt.title("Mean of g(xBar)-g(mu) approaches 0", fontsize=fontsize)
    expMean = 0
    plt.plot(nVals, means, 'ko', label="Actual Mean")
    plt.axhline(expMean, color='b', linestyle='--',
                label="Expected Mean: {:.2g}".format(expMean))
    # limits are symmetric about the expected mean so negative means stay
    # on screen; a lower limit of -min(means) flips the axis when the
    # smallest mean is below zero
    halfSpan = 1.1 * np.max(np.abs(means - expMean))
    if halfSpan > 0:
        plt.ylim(expMean - halfSpan, expMean + halfSpan)
    plt.xlabel("Value of n for binomial", fontsize=fontsize)
    plt.ylabel("Value of g(xBar)-g(mu)", fontsize=fontsize)
    plt.legend(fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    plt.subplot(1, 2, 2)
    plt.semilogy(nVals, varReal, 'ko', label="Actual Variance")
    plt.semilogy(nVals, varDist, 'b--', label="Expected Variance")
    plt.title("Variance of g(xBar)-g(mu)\n approaches expected",
              fontsize=fontsize)
    plt.xlabel("Value of n for binomial", fontsize=fontsize)
    plt.ylabel("Value of g(xBar) variance", fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    plt.legend(fontsize=fontsize)
    pPlotUtil.savefig(fig, outDir + "MeanVar")
def plotErrorAnalysis(mean,std,params,labels,fullOutput):
    """Plot train/validation accuracy versus parameter for several fitters.

    Prints, best first, the highest (mean - std) of column 1 for each entry,
    then draws one error-bar subplot per entry and saves the figure as
    fullOutput + 'allAcc'.

    Args:
        mean: sequence of 2-D arrays; column 0 is plotted as 'train' and
            column 1 as 'vld'.
        std: sequence of arrays shaped like those in mean (error bars).
        params: sequence of parameter-value sequences, one per entry.
        labels: sequence of titles, one per entry.
        fullOutput: path prefix for the saved figure.
    """
    nTrials = len(mean)
    # despite its name this is the number of subplot columns in the grid
    rowsPerPlot = min(4,nTrials)
    fig = pPlotUtil.figure(xSize=rowsPerPlot*6,ySize=nTrials*4)
    colors = pPlotUtil.cmap(nTrials)
    minP = min([min(p) for p in params])
    maxP = max([max(p) for p in params])
    lowerAcc = min([min(acc.flatten()) for acc in mean])
    lowerBounds = [(meanV[:,1]-stdV[:,1]) for meanV,stdV in zip(mean,std)]
    validLowerBound = np.array([np.max(bound) for bound in lowerBounds])
    bestIdx = np.array([np.argmax(bound) for bound in lowerBounds])
    sortedBestValid = np.argsort(validLowerBound)[::-1]
    for idx in sortedBestValid:
        print("{:s} has lower accuracy of {:.3f} at condition {:.2g}".
              format(labels[idx],validLowerBound[idx],bestIdx[idx]))
    fontsize = 20
    # integer ceiling division: subplot needs whole-number grid sizes, and a
    # plain '/' would floor under Python 2, leaving the grid one row short
    nRows = -(-nTrials // rowsPerPlot)
    for i,(meanV,stdV,pVals,lab) in enumerate(zip(mean,std,params,labels)):
        ax = plt.subplot(nRows,rowsPerPlot,i+1)
        plt.errorbar(pVals,meanV[:,0],stdV[:,0],fmt='o-',color=colors[i],
                     label='train')
        plt.errorbar(pVals,meanV[:,1],stdV[:,1],fmt='x--',color=colors[i],
                     label='vld')
        ax.set_xscale('log')
        plt.axhline(0.8,color='r',linestyle='--')
        plt.ylim([lowerAcc*0.9,1])
        plt.xlim([minP*0.7,maxP*1.3])
        plt.title(lab,fontsize=fontsize)
    # axis labels go on the last subplot drawn
    plt.xlabel('Classifier parameter')
    plt.ylabel('Accuracy')
    pPlotUtil.savefig(fig,fullOutput + 'allAcc')
def plotAccuracies(outDir,label,accMean,accStd,fitParam):
    """Plot training and validation accuracy against the fit parameter.

    Args:
        outDir: output directory prefix; figure is saved as
            outDir + "accuracies".
        label: fitter name shown in the title.
        accMean: 2-D array; column 0 is plotted as the training set and
            column 1 as the validation set.
        accStd: array shaped like accMean, used for the error bars.
        fitParam: parameter values for the (log-scaled) x axis.
    """
    fig = pPlotUtil.figure()
    plt.errorbar(fitParam,accMean[:,0],accStd[:,0],fmt='ro-',
                 label="Training Set")
    plt.errorbar(fitParam,accMean[:,1],accStd[:,1],fmt='kx-',
                 label="Validation Set")
    # no extra keyword: 'nonposy' configures the y axis, so it has no
    # meaning for an x scale and is rejected by newer matplotlib releases
    plt.xscale('log')
    plt.axhline(1,color='b',linestyle='--',label='max')
    plt.xlabel("Fit parameter")
    plt.ylabel("Accuracy")
    plt.xlim([min(fitParam)*0.7,max(fitParam)*1.3])
    plt.legend(loc='best')
    plt.title('Accuracy vs fit parameter for fitter: {:s}'.format(label))
    pPlotUtil.savefig(fig,outDir + "accuracies")
def predict(fitter,x,yReal,rawDat,label,saveDir,colNames,fitterCoeff,objClass,
            featureObjects,saveBad=False,saveCoeffs=True,plot=True):
    """Predict with a fitted model, optionally profile errors and plot coeffs.

    Args:
        fitter: object with a predict method.
        x: feature matrix; if predict raises TypeError it is retried with
            x.toarray().
        yReal: true labels.
        rawDat: raw data, forwarded to profileLosers.
        label: name used in output file names.
        saveDir: output directory prefix.
        colNames: array of column names (must support .size and indexing
            by an index array).
        fitterCoeff: callable returning the coefficient array of fitter.
        objClass: forwarded to profileLosers.
        featureObjects: indexable collection of feature objects.
        saveBad: when true, call profileLosers on the mispredictions.
        saveCoeffs: accepted for interface compatibility; not read here.
        plot: when true, save the sorted coefficients to
            saveDir + label + "coeffs" and plot them.

    Returns:
        The accuracy score of the predictions.
    """
    try:
        yPred = fitter.predict(x)
    except TypeError:
        yPred = fitter.predict(x.toarray())
    # computed but not used further in this function
    cm = confusion_matrix(yReal,yPred)
    acc = accuracy_score(yReal,yPred)
    # Show confusion matrix in a separate window
    if (saveBad):
        # XXX could profile?
        profileLosers(saveDir,label,yPred,yReal,rawDat,objClass,x,
                      featureObjects)
    if (plot):
        fig = pPlotUtil.figure()
        ax = plt.subplot(1,1,1)
        numCols = colNames.size
        coeffs = fitterCoeff(fitter)
        nCoeffs = coeffs.size
        xRange = range(nCoeffs)
        saveName = saveDir + label + "coeffs"
        # largest coefficient first
        sortIdx = np.argsort(coeffs)[::-1]
        sortedCoeffs = coeffs[sortIdx]
        sortedNames = colNames[sortIdx]
        sortedFeatures = [featureObjects[s] for s in sortIdx]
        stacked = np.vstack((sortedNames,sortedCoeffs)).T
        np.savetxt(saveName,stacked,fmt=["%s","%.3g"],delimiter="\t")
        # function-call form emits the same text as the old print statement
        # and is valid on both Python 2 and Python 3
        print("{:d}  Columns".format(numCols))
        maxToPlot = min(numCols//2,25)
        # on each side
        if (numCols == nCoeffs):
            # then we have a coefficient per feature (column), so use them
            # for ticks
            coeffsToPlot = list(sortedCoeffs[:maxToPlot]) + \
                           list(sortedCoeffs[-maxToPlot:])
            labelsToPlot = list(sortedNames[:maxToPlot]) + \
                           list(sortedNames[-maxToPlot:])
            featuresPlotted = list(sortedFeatures[:maxToPlot]) + \
                              list(sortedFeatures[-maxToPlot:])
            xToPlot = range(len(coeffsToPlot))
            ax.bar(xToPlot,coeffsToPlot,align='center')
            ax.set_xticks(xToPlot)
            ax.set_xticklabels(labelsToPlot,rotation='vertical')
            plt.xlabel("coefficient name")
            plt.ylabel("Predictor strength")
        else:
            plt.plot(xRange,coeffs,'ro-')
            plt.xlabel("Fitter Coefficients")
            plt.ylabel("Predictor strength")
        pPlotUtil.savefig(fig,saveName)
    return acc
def plotFec(expect,algorithm,inFile,saveAs):
    """Plot a force curve zoomed near its start, with two surface markers.

    Time, separation and force are read from inFile with
    HDF5Util.GetTimeSepForce and drawn by IgorUtil.PlotFec. Vertical lines
    mark the expected and the algorithm-determined surface locations, and
    the figure is saved to saveAs.

    Args:
        expect: expected surface location, same units as the separation.
        algorithm: surface location found by the algorithm, same units.
        inFile: input file handed to HDF5Util.GetTimeSepForce.
        saveAs: output path for the figure.
    """
    time,sep,force = HDF5Util.GetTimeSepForce(inFile)
    fig = pPlotUtil.figure()
    IgorUtil.PlotFec(sep,force)
    # limit the axis to close to the touchoff (10% of range)
    # (plotFEC starts at 0)
    minV = min(sep)
    rangeSepNm = 1e9 * abs(max(sep)-minV)
    plt.xlim([0,rangeSepNm/10])

    def toNm(val):
        # offset from the minimum separation, scaled by 1e9
        return (val-minV)*1e9

    # plot the expected and algorithm locations as nm, normalized to min
    markers = [(expect,"Expected surface location","g"),
               (algorithm,"Algorithm surface location","k")]
    for location,text,lineColor in markers:
        plt.axvline(toNm(location),label=text,lw=3,linestyle="--",
                    color=lineColor)
    pPlotUtil.legend()
    pPlotUtil.savefig(fig,saveAs)
def plotAll(kArrs,outDir):
    """Save one normalized K histogram per array and return their statistics.

    Args:
        kArrs: sequence of arrays of K values, one array per DNA length
            (entry i is titled with lengths[i]).
        outDir: output directory prefix; figure i is saved as
            outDir + "k<i>".

    Returns:
        Tuple (means, stdevs) of numpy arrays with one entry per array in
        kArrs.

    Relies on the module-level names numOligos and lengths.
    """
    # shared bins so every histogram spans the same K range
    maxK = max([max(k) for k in kArrs])
    bins = range(maxK+1)
    means = np.array([np.mean(k) for k in kArrs])
    stdevs = np.array([np.std(k) for k in kArrs])
    for i,k in enumerate(kArrs):
        fig = pPlotUtil.figure()
        # NOTE(review): 'normed' was replaced by 'density' in later
        # matplotlib releases -- confirm against the installed version
        plt.hist(k,bins=bins,align='left',
                 label='Data from {:d} sequences'.format(int(numOligos)),
                 normed=True)
        mean = means[i]
        plt.axvline(mean,color='r',label="Mean:{:.3f}".format(mean),
                    linewidth=2.0)
        plt.xlim([0,maxK])
        plt.xlabel('K, minimum k-mer with at most 1 occurence in DNA sequence')
        plt.ylabel('Proportion of occurences')
        plt.title('K histogram (normalized) for DNA sequences of length {:d}'.
                  format(lengths[i]))
        plt.legend()
        pPlotUtil.savefig(fig,outDir + "k{:d}".format(i))
    return means,stdevs
def profileLosers(saveDir,label,yPred,yActual,rawDat,dataClass,featureMat,
                  featureObjects):
    """Plot and dump the samples that were predicted incorrectly.

    The mispredicted rows (from getIdxMistakes) are drawn as a normalized
    feature matrix via plotFeatMatr, a horizontal line is added at the number
    of wrongly-predicted survivors, the raw rows are written to
    saveDir + 'debug_<label>.csv' and the figure to saveDir + "mOut" + label.

    Args:
        saveDir: output directory prefix.
        label: name used in the output file names.
        yPred: predicted labels.
        yActual: true labels.
        rawDat: 2-D array of raw data, indexed by the misprediction indices.
        dataClass: not used inside this function.
        featureMat: feature matrix handed to the helper functions.
        featureObjects: feature descriptions handed to plotFeatMatr.

    Relies on the module-level name g_title.
    """
    # get what we got wrong
    badIdx,predictedDeath,predictedSurv = getIdxMistakes(yPred,yActual)
    nSurv = len(predictedSurv)
    nDead = len(predictedDeath)
    fig = pPlotUtil.figure(xSize=16,ySize=12,dpi=200)

    def sorter(x):
        return sortByPred(x,yPred,yActual)

    # get the matrix, all features 0 --> 1
    toPlot = getNormalizedFeatureMatrix(badIdx,featureMat,sorter)
    # get the number of non-zero elements in each column
    plotFeatMatr(toPlot,featureObjects,featureMat,saveDir,label,badIdx)
    plt.axhline(nSurv,linewidth=3,color='c')
    titleStr = ("Line Divides {:d} actual deceased from {:d} actual survived"
                .format(nSurv,nDead))
    plt.title(titleStr,y=1.3,fontsize=g_title)
    plt.legend(loc="upper right",bbox_to_anchor=(0.4, -0.4))
    # dump the raw rows behind the mistakes for offline inspection
    badVals = rawDat[badIdx,:]
    csvName = saveDir + 'debug_{:s}.csv'.format(label)
    np.savetxt(csvName,badVals,fmt="%s",delimiter=',')
    pPlotUtil.savefig(fig,saveDir + "mOut" + label,tight=True)
def plotSpotDist(mLabels,spots,outPath,subtractMean):
    """Plot how the labelled surface location is distributed across spots.

    Three figures are saved under the outPath prefix: "AllSpots.png" (one 3-D
    histogram row per spot), "Surface.png" (wireframe of the per-spot mean,
    reshaped to 5 columns) and "FlatSpots.png" (pooled histogram plus its
    cumulative distribution).

    Args:
        mLabels: array of surface labels; scaled by 1e9 before use
            (presumably meters to nm -- confirm against caller).
        spots: array of spot identifiers, one per label; labels whose spot
            value is within 1e-9 of each other are grouped together.
        outPath: path prefix for the saved figures.
        subtractMean: when true, each spot's labels are centered on that
            spot's mean before the histograms are built.
    """
    colors = ['r', 'g', 'b', 'y','k']
    nColors = len(colors)
    # go to nm
    mLabelsNm = mLabels * 1e9
    mSetSpots = sorted(set(spots))
    labelsBySpot = []
    rawBySpot = []
    flattenedFromMean = []
    # first, get the spot-wise labelling
    for spot in mSetSpots:
        # get the indices of the spots
        spotIdx = np.where(abs((spots - spot)) < 1e-9)[0]
        # index-array selection returns a copy, so the in-place subtraction
        # below does not touch mLabelsNm
        thisSpotLabels = mLabelsNm[spotIdx]
        meanV = np.mean(thisSpotLabels)
        if (subtractMean):
            thisSpotLabels -= meanV
        labelsBySpot.append(thisSpotLabels)
        flattenedFromMean.extend(thisSpotLabels)
        # rawBySpot always holds the un-centered values
        if (subtractMean):
            rawBySpot.append(thisSpotLabels + meanV)
        else:
            rawBySpot.append(thisSpotLabels)
    # get the min and max from the labelsBySpot array
    bins = np.linspace(min(flattenedFromMean),max(flattenedFromMean),10)
    fig = pPlotUtil.figure(xSize=12,ySize=12)
    ax = fig.add_subplot(111, projection='3d')
    for i,thisSpotLabels in enumerate(labelsBySpot):
        mColor = colors[i % nColors]
        height,left = np.histogram(thisSpotLabels,bins=bins)
        ax.bar(left[:-1], height, zs=i,zdir='y', color=mColor, alpha=0.7,
               edgecolor="none",linewidth=0)
    xStr = r'$\Delta$ from Expected Surface Loc. [nm]'
    pPlotUtil.lazyLabel(xStr,
                        "Surface Position (arb)",
                        "Dependence of Surface Location Distribution on Position",
                        zlab="Count")
    pPlotUtil.savefig(fig,outPath + "AllSpots.png")
    # get a figure showing the mean surface location, assuming
    # we reshape into an Nx(whatever) array
    N = 5
    # -1: infer dimension
    meanVals = [np.mean(mList) for mList in rawBySpot]
    meanSurf = np.reshape(meanVals,(-1,N))
    meanSurf -= np.min(meanSurf)
    fig = pPlotUtil.figure(ySize=14,xSize=10)
    ax = fig.add_subplot(111, projection='3d')
    # convert to nm (XXX assuming grid is 1micron for each)
    Nx = N
    Ny = meanSurf.shape[0]
    x = np.linspace(0, Nx, Nx) * 1e3
    y = np.linspace(0, Ny, Ny) * 1e3
    xv, yv = np.meshgrid(x, y)
    ax.plot_wireframe(xv,yv,meanSurf)
    pPlotUtil.lazyLabel("X Location [nm]","Y Location [nm]",
                        "Surface Position Varies with height")
    pPlotUtil.zlabel("Surface height (relative to min)")
    pPlotUtil.savefig(fig,outPath + "Surface.png")
    fig = pPlotUtil.figure(ySize=14,xSize=10)
    plt.subplot(2,1,1)
    nPoints = len(flattenedFromMean)
    vals,edges,_ = plt.hist(flattenedFromMean,bins=bins)
    # add a 'fudge' factor to make plotting better,
    fudgeX = (max(edges)-min(edges))*0.05
    xlim = [min(edges)-fudgeX,max(edges)+fudgeX]
    pPlotUtil.lazyLabel(xStr,
                        "Number of counts",
                        "Algorithm finds surface within 10nm, >98%, N={:d}".
                        format(nPoints))
    # range of the secondary axis: counts expressed as a proportion
    normed = [0,max(vals)/sum(vals)]
    plt.xlim(xlim)
    propAx = pPlotUtil.secondAxis(plt.gca(),"Proportion",normed,yColor="Red")
    # y must be numeric: a string would not land at 0.05 on the axis
    propAx.axhline(0.05,color='r',
                   label="5% of Curves",linestyle='--',linewidth=4.0)
    pPlotUtil.legend()
    # plot the CDF
    plt.subplot(2,1,2)
    # add a zero at the start, so the plot matches the PDF
    cdf = np.zeros(edges.size)
    cdf[1:] = np.cumsum(vals/sum(vals))
    mX = edges
    plt.plot(mX,cdf,linestyle='--',linewidth=4,color='k')
    plt.xlim(xlim)
    pPlotUtil.lazyLabel(xStr,
                        "Cummulative Proportion",
                        ("Cummulative Density Function," +
                         "Surface Detection Performance"))
    plt.gca().fill_between(mX, 0, cdf,alpha=0.3)
    pPlotUtil.savefig(fig,outPath + "FlatSpots.png")
lengths = np.array([2,4,8,16,32,64,128,256,512]) # save the K array: minimum k to have at most one k-mer # initialize to -1, so that we know when we have the minimum outDir = "./out/" pGenUtil.ensureDirExists(outDir) forceRun = False test = False # use checkpointing to save data, since it takes forever kArr = pCheckUtil.getCheckpoint('./tmp/check.pkl',getKSequence,forceRun, lengths,numOligos,weights,chars) meanVals,std = pCheckUtil.getCheckpoint('./tmp/meanStd.pkl',plotAll, forceRun,kArr,outDir) if (test): testDnaGeneration(chars,lengths,numOligos,weights) # plot the mean k vs dna length, l (in theory, k is approx log_1/q(l+1)) fig = pPlotUtil.figure() ax = plt.subplot(1,3,1) plt.errorbar(x=lengths,y=meanVals,yerr=std,fmt='ro-',label='Mean K') tKVals = getTheoryK(lengths,q) plt.plot(lengths,tKVals,'b--',label='Log_[1/q](l+1)') xLab = 'DNA Length (l)' plt.xlabel(xLab) plt.ylabel('Mean K value') plt.title('Mean K vs length') ax.set_xscale('log') plt.legend(loc='best') ax = plt.subplot(1,3,2) plotError(meanVals,tKVals,lengths,xLab,'Absolute Error in Mean K ', 'Absolute error in Mean K',ax,relative=False) ax = plt.subplot(1,3,3) plotError(meanVals,tKVals,lengths,xLab,'Relative Error in Mean K [0-->1]',
import PlotUtilities as pPlotUtil
import CheckpointUtilities as pCheckUtil
from scipy.stats import norm

# Script: plot the probability of a Normal(mean, stdev) variable landing
# within epsilon of an offset c0, as c0 sweeps mean +/- 3*stdev, and save
# the figure as outDir + "q1_1".
outDir = "./out/"
pGenUtil.ensureDirExists(outDir)
mean = 0
stdev = 1
# float literal keeps epsilon at stdev/100 under Python 2 integer division
# (an integer stdev would otherwise give 0)
epsilon = stdev / 100.0
nPoints = 1000
normDist = norm(loc=mean, scale=stdev)
offsets = np.linspace(mean - 3 * stdev, mean + 3 * stdev, nPoints)
# normDist already carries loc and scale, so it is evaluated at the raw
# offsets; standardizing them first would apply the shift and scale twice
probability = 2 * (normDist.cdf(offsets + epsilon) -
                   normDist.cdf(offsets - epsilon))
fig = pPlotUtil.figure()
plt.plot(offsets,probability,'r-',
         label="mu = {:.1f}, sigma = {:.1f}, epsilon = {:.2f}".
         format(mean,stdev,epsilon))
plt.xlabel("offset for CDF, c0")
plt.ylabel("Probability (arbitrary units) to land within epsilon of c0")
plt.axvline(0, color='k', linestyle='--',
            label="Maximum probability when centered near mu")
plt.legend(loc='best')
plt.title("Probability of landing within epsilon of c0 maximized near mu")
pPlotUtil.savefig(fig, outDir + "q1_1")