Beispiel #1
0
def plotBinomials(dataMatrix,nVals,p):
    """
    Plots one histogram per binomial size (via plotSingleHist), then a
    two-panel summary of the mean and variance of g(xBar)-g(mu) versus n.

    Reads 'outDir' and 'fontsize' from the enclosing scope; they are not
    parameters of this function.

    Args:
        dataMatrix: 2-D array indexed as dataMatrix[i,:]; row i is handed to
            plotSingleHist together with nVals[i]
        nVals: array of binomial n values (must expose '.size')
        p: binomial probability, passed through to plotSingleHist
    Returns:
        None. The summary figure is saved as outDir + "MeanVar".
    """
    nTrials = nVals.size # rows are the trials
    # save the mean and variances reported for each n
    means = np.zeros(nTrials)
    varReal = np.zeros(nTrials)
    varDist = np.zeros(nTrials)
    for i,n in enumerate(nVals):
        means[i],varReal[i],varDist[i] =\
                    plotSingleHist(n,p,dataMatrix[i,:],outDir)
    # plot the means and variances
    fig = pPlotUtil.figure()
    plt.subplot(1,2,1)
    plt.title("Mean of g(xBar)-g(mu) approaches 0",fontsize=fontsize)
    expMean = 0
    plt.plot(nVals,means,'ko',label="Actual Mean")
    plt.axhline(expMean,color='b',linestyle='--',
                label="Expected Mean: {:.2g}".format(expMean))
    # Bound the axis by the data *and* the expected mean, padded by 10%.
    # The former lower bound of -min(means) flipped the sign, which cut off
    # negative means and the expected-mean line whenever min(means) < 0.
    yLow = min(np.min(means),expMean)
    yHigh = max(np.max(means),expMean)
    yPad = 0.1*(yHigh-yLow)
    if (yPad == 0):
        # every mean equals the expected mean; keep a non-degenerate axis
        yPad = 1.0
    plt.ylim(yLow-yPad,yHigh+yPad)
    plt.xlabel("Value of n for binomial",fontsize=fontsize)
    plt.ylabel("Value of g(xBar)-g(mu)",fontsize=fontsize)
    plt.legend(fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    # second panel: measured vs expected variance, log-scaled in y
    plt.subplot(1,2,2)
    plt.semilogy(nVals,varReal,'ko',label="Actual Variance")
    plt.semilogy(nVals,varDist,'b--',label="Expected Variance")
    plt.title("Variance of g(xBar)-g(mu)\n approaches expected",
              fontsize=fontsize)
    plt.xlabel("Value of n for binomial",fontsize=fontsize)
    plt.ylabel("Value of g(xBar) variance",fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    plt.legend(fontsize=fontsize)
    pPlotUtil.savefig(fig,outDir + "MeanVar")
Beispiel #2
0
def plotAlignments(outDir,alignments,scoreOptimal,label):
    """
    Plots the alignment-score histogram on two side-by-side panels, overlays
    a normal PDF built from the histogram's mean and standard deviation, and
    marks the optimal score on the second panel.

    Args:
        outDir: path prefix for the saved figure
        alignments: iterable of alignments; each is scored by getAlignScore
        scoreOptimal: optimal alignment score, compared against the
            distribution of 'alignments' scores
        label: string naming the scoring scheme; appears in the printed
            message, the legend and the output file name
    Returns:
        None. Prints the z-score and saves outDir + "q2Histograms" + label.
    """
    fontSize = 25
    scores = [ getAlignScore(a) for a in alignments ]
    fig = pPlotUtil.figure(xSize=24,ySize=12)
    plt.subplot(1,2,1)
    meanScore,stdevScore,bins = plotScoreHistograms(scores,fontSize,'k')
    plt.title("Shuffled DNA alignment Histogram",
              fontsize=fontSize)
    pdfFunc = norm(loc=meanScore,scale=stdevScore).pdf(bins)
    # the same fitted-normal curve is drawn on both panels
    def plotPDF():
        return plt.plot(bins,pdfFunc,'g--',linewidth=3.0,
                        label="Normal(mean,var)")
    plotPDF()
    plt.legend(fontsize=fontSize)
    plt.subplot(1,2,2)
    plotScoreHistograms(scores,fontSize)
    plotPDF()
    zScore = (scoreOptimal-meanScore)/stdevScore
    print("Z Score for {:s} is {:.2f}".format(label,zScore))
    # Upper-tail probability under a standard normal. The survival function
    # is used instead of 1-cdf, which rounds to exactly 0 for large z.
    # NOTE(review): the original author doubted this is a true p-value.
    extrProb = norm().sf(zScore)
    plt.title(("Histogram of optimal alignment score for {:d} trials\n" +
               "Optimal score: {:d}*sigma from shuffled mean.\n"
               "P(shuffled score>=optimal) ~ {:.5g}").\
              format(len(scores),int(zScore),extrProb),fontsize=fontSize)
    plt.axvline(scoreOptimal,color='r',linestyle='--',
                label="Optimal global alignment score using {:s}: {:d}".\
                format(label,int(scoreOptimal)))
    plt.legend(loc='best',fontsize=fontSize)
    pPlotUtil.savefig(fig,outDir+ "q2Histograms" + label)
Beispiel #3
0
def plotSingleHist(n,p,xTrials,outDir):
    """
    Draws the histogram of g(xBar)-g(mu) for one binomial size and overlays
    the normal density returned by getDeltaModelDistr.

    Reads the font sizes 'g_title' and 'g_label' from the enclosing scope.

    Args:
        n: binomial n; formatted as an integer in the title and file name
        p: binomial p; shown in the title
        xTrials: trial data, forwarded unchanged to getDeltaModelDistr
        outDir: path prefix for the saved figure
    Returns:
        tuple of (mean, variance) as reported by getDeltaModelDistr for the
        data, followed by the square of the reported normal stdev
    """
    hFig = pPlotUtil.figure()
    # everything needed for the plot comes from the model helper
    (samples,sampleMean,sampleVar,theoryStd,
     theoryDist,grid,histBins) = getDeltaModelDistr(n,p,xTrials)
    # histogram of the data, labelled with its own statistics
    histLabel = "Actual Distr: Mean={:.4f},Stdev={:.4f}".format(
        sampleMean,np.sqrt(sampleVar))
    histHeights = normHist(samples,histBins,label=histLabel)
    # overlay of the model density
    theoryPdf = theoryDist.pdf(grid)
    theoryLabel = "Theorertical Distr: Stdev={:.4f}".format(theoryStd)
    plt.plot(grid,theoryPdf,'r--',linewidth=5.0,label=theoryLabel)
    nAsInt = int(n)
    titleText = "Histogram for g(xBar)-g(mu) for n={:d},p={:.2f}".format(
        nAsInt,p)
    plt.title(titleText,fontsize=g_title)
    plt.xlabel("(g(Xbar)-g(mu)) ~ Normal(0,[g'(x)*sigma]^2/n)",
               fontsize=g_label)
    plt.ylabel("Proportion",fontsize=g_label)
    plt.legend(frameon=False)
    pPlotUtil.tickAxisFont()
    # leave 20% headroom above the taller of the curve and the histogram
    tallest = max(list(theoryPdf) + list(histHeights))
    plt.ylim([0,tallest*1.2])
    # x axis is symmetric about zero
    widest = max(histBins)
    plt.xlim([-widest,widest])
    pPlotUtil.tickAxisFont()
    pPlotUtil.savefig(hFig,outDir + "trial_n{:d}".format(nAsInt))
    # hand the statistics back for the summary plots
    return sampleMean,sampleVar,theoryStd**2
Beispiel #4
0
def plotSingleHist(n, p, xTrials, outDir):
    """
    Plot the g(xBar)-g(mu) histogram for a single binomial size, with the
    normal density from getDeltaModelDistr drawn on top.

    The title and label font sizes come from 'g_title' and 'g_label', which
    are looked up in the enclosing scope.

    Args:
        n: binomial n; cast to int for the title and the file name
        p: binomial p; shown in the title
        xTrials: trial data, passed as-is to getDeltaModelDistr
        outDir: path prefix for the saved figure
    Returns:
        (mean, variance) of the data as reported by getDeltaModelDistr,
        plus the reported normal stdev squared
    """
    figHandle = pPlotUtil.figure()
    modelOut = getDeltaModelDistr(n, p, xTrials)
    data, dataMean, dataVar, modelStd, modelDist, xGrid, binArg = modelOut
    # data histogram, labelled with the measured statistics
    dataLabel = "Actual Distr: Mean={:.4f},Stdev={:.4f}".format(
        dataMean, np.sqrt(dataVar))
    barHeights = normHist(data, binArg, label=dataLabel)
    # model curve
    modelPdf = modelDist.pdf(xGrid)
    modelLabel = "Theorertical Distr: Stdev={:.4f}".format(modelStd)
    plt.plot(xGrid, modelPdf, 'r--', linewidth=5.0, label=modelLabel)
    nInt = int(n)
    plt.title(
        "Histogram for g(xBar)-g(mu) for n={:d},p={:.2f}".format(nInt, p),
        fontsize=g_title)
    plt.xlabel("(g(Xbar)-g(mu)) ~ Normal(0,[g'(x)*sigma]^2/n)",
               fontsize=g_label)
    plt.ylabel("Proportion", fontsize=g_label)
    plt.legend(frameon=False)
    pPlotUtil.tickAxisFont()
    # y range: 20% above the highest plotted value
    peak = max(list(modelPdf) + list(barHeights))
    plt.ylim([0, peak * 1.2])
    # x range: symmetric about zero
    halfWidth = max(binArg)
    plt.xlim([-halfWidth, halfWidth])
    pPlotUtil.tickAxisFont()
    pPlotUtil.savefig(figHandle, outDir + "trial_n{:d}".format(nInt))
    # statistics used by the caller's summary plot
    return dataMean, dataVar, modelStd**2
Beispiel #5
0
def plotBinomials(dataMatrix, nVals, p):
    """
    Plots one histogram per binomial size (via plotSingleHist), then a
    two-panel summary of the mean and variance of g(xBar)-g(mu) versus n.

    Reads 'outDir' and 'fontsize' from the enclosing scope; they are not
    parameters of this function.

    Args:
        dataMatrix: 2-D array indexed as dataMatrix[i,:]; row i is handed to
            plotSingleHist together with nVals[i]
        nVals: array of binomial n values (must expose '.size')
        p: binomial probability, passed through to plotSingleHist
    Returns:
        None. The summary figure is saved as outDir + "MeanVar".
    """
    nTrials = nVals.size  # rows are the trials
    # save the mean and variances reported for each n
    means = np.zeros(nTrials)
    varReal = np.zeros(nTrials)
    varDist = np.zeros(nTrials)
    for i, n in enumerate(nVals):
        means[i],varReal[i],varDist[i] =\
                    plotSingleHist(n,p,dataMatrix[i,:],outDir)
    # plot the means and variances
    fig = pPlotUtil.figure()
    plt.subplot(1, 2, 1)
    plt.title("Mean of g(xBar)-g(mu) approaches 0", fontsize=fontsize)
    expMean = 0
    plt.plot(nVals, means, 'ko', label="Actual Mean")
    plt.axhline(expMean,
                color='b',
                linestyle='--',
                label="Expected Mean: {:.2g}".format(expMean))
    # Bound the axis by the data *and* the expected mean, padded by 10%.
    # The former lower bound of -min(means) flipped the sign, which cut off
    # negative means and the expected-mean line whenever min(means) < 0.
    yLow = min(np.min(means), expMean)
    yHigh = max(np.max(means), expMean)
    yPad = 0.1 * (yHigh - yLow)
    if yPad == 0:
        # every mean equals the expected mean; keep a non-degenerate axis
        yPad = 1.0
    plt.ylim(yLow - yPad, yHigh + yPad)
    plt.xlabel("Value of n for binomial", fontsize=fontsize)
    plt.ylabel("Value of g(xBar)-g(mu)", fontsize=fontsize)
    plt.legend(fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    # second panel: measured vs expected variance, log-scaled in y
    plt.subplot(1, 2, 2)
    plt.semilogy(nVals, varReal, 'ko', label="Actual Variance")
    plt.semilogy(nVals, varDist, 'b--', label="Expected Variance")
    plt.title("Variance of g(xBar)-g(mu)\n approaches expected",
              fontsize=fontsize)
    plt.xlabel("Value of n for binomial", fontsize=fontsize)
    plt.ylabel("Value of g(xBar) variance", fontsize=fontsize)
    pPlotUtil.tickAxisFont()
    plt.legend(fontsize=fontsize)
    pPlotUtil.savefig(fig, outDir + "MeanVar")
def plotErrorAnalysis(mean,std,params,labels,fullOutput):
    """
    Prints a ranking of the classifiers by their best validation lower bound
    (mean minus std) and plots train/validation accuracy with error bars,
    one subplot per classifier.

    Args:
        mean: sequence of 2-D arrays, one per classifier; column 0 is plotted
            as 'train' and column 1 as 'vld'
        std: sequence of arrays shaped like those in 'mean', used as error bars
        params: sequence of parameter-value sequences, one per classifier
            (x axis, log-scaled)
        labels: sequence of classifier names, used as subplot titles
        fullOutput: path prefix; the figure is saved as fullOutput + 'allAcc'
    Returns:
        None
    """
    nTrials = len(mean)
    # number of subplot columns (at most 4)
    rowsPerPlot = min(4,nTrials)
    fig = pPlotUtil.figure(xSize=rowsPerPlot*6,ySize=nTrials*4)
    colors = pPlotUtil.cmap(nTrials)
    # shared axis limits across all subplots
    minP = min([ min(p) for p in params] )
    maxP = max([ max(p) for p in params] )
    lowerAcc = min([min(acc.flatten()) for acc in mean])
    # validation accuracy minus one standard deviation, per condition
    lowerBounds = [(meanV[:,1]-stdV[:,1]) for meanV,stdV in zip(mean,std) ]
    validLowerBound = np.array([np.max(bound) for bound in lowerBounds ])
    bestIdx = np.array([np.argmax(bound) for bound in lowerBounds ] )
    # report best-first; note the 'condition' printed is the index of the
    # best parameter, not the parameter value itself
    sortedBestValid = np.argsort(validLowerBound)[::-1]
    for idx in sortedBestValid:
        print("{:s} has lower accuracy of {:.3f} at condition {:.2g}".\
            format(labels[idx],validLowerBound[idx],bestIdx[idx]))
    # plt.subplot needs integer grid dimensions. Divide as floats so that the
    # ceil really rounds up under Python 2 as well (where int/int floors).
    nRows = int(np.ceil(nTrials/float(rowsPerPlot)))
    fontsize=20
    for i,(meanV,stdV,pVals,lab) in enumerate(zip(mean,std,params,labels)):
        ax=plt.subplot(nRows,rowsPerPlot,i+1)
        plt.errorbar(pVals,meanV[:,0],stdV[:,0],fmt='o-',color=colors[i],
                     label='train')
        plt.errorbar(pVals,meanV[:,1],stdV[:,1],fmt='x--',color=colors[i],
                     label='vld')
        ax.set_xscale('log')
        # fixed reference line at 80% accuracy
        plt.axhline(0.8,color='r',linestyle='--')
        plt.ylim([lowerAcc*0.9,1])
        plt.xlim([minP*0.7,maxP*1.3])
        plt.title(lab,fontsize=fontsize)
        plt.xlabel('Classifier parameter')
        plt.ylabel('Accuracy')
    pPlotUtil.savefig(fig,fullOutput + 'allAcc')
def plotAccuracies(outDir,label,accMean,accStd,fitParam):
    """
    Plots training and validation accuracy (with error bars) against the
    fit parameter on a log-scaled x axis.

    Args:
        outDir: path prefix; the figure is saved as outDir + "accuracies"
        label: name of the fitter, shown in the title
        accMean: 2-D array; column 0 is plotted as the training set and
            column 1 as the validation set
        accStd: array shaped like accMean, used for the error bars
        fitParam: sequence of fit-parameter values (x axis)
    Returns:
        None
    """
    fig = pPlotUtil.figure()
    plt.errorbar(fitParam,accMean[:,0],accStd[:,0],fmt='ro-',
                 label="Training Set")
    plt.errorbar(fitParam,accMean[:,1],accStd[:,1],fmt='kx-',
                 label="Validation Set")
    # The former 'nonposy' keyword belongs to the y axis: on an x scale it
    # never had an effect, and later matplotlib versions reject it.
    plt.xscale('log')
    plt.axhline(1,color='b',linestyle='--',label='max')
    plt.xlabel("Fit parameter")
    plt.ylabel("Accuracy")
    plt.xlim([min(fitParam)*0.7,max(fitParam)*1.3])
    plt.legend(loc='best')
    plt.title('Accuracy vs fit parameter for fitter: {:s}'.format(label))
    pPlotUtil.savefig(fig,outDir + "accuracies")
def predict(fitter,x,yReal,rawDat,label,saveDir,colNames,fitterCoeff,objClass,
            featureObjects,saveBad=False,saveCoeffs=True,plot=True):
    """
    Runs the fitter on x, scores the prediction against yReal, and optionally
    profiles the mistakes and plots/saves the fitter coefficients.

    Args:
        fitter: object with a 'predict' method
        x: feature matrix; if 'predict' raises TypeError it is retried with
            x.toarray()
        yReal: true labels
        rawDat: raw data, forwarded to profileLosers
        label: string used in output file names
        saveDir: path prefix for all output files
        colNames: array of column names (must support '.size' and indexing
            by an index array)
        fitterCoeff: callable taking the fitter and returning its
            coefficients as an array
        objClass: forwarded to profileLosers
        featureObjects: forwarded to profileLosers
        saveBad: if true, call profileLosers on the predictions
        saveCoeffs: accepted for interface compatibility; not used here
        plot: if true, save the sorted coefficient table and a coefficient
            plot under saveDir + label + "coeffs"
    Returns:
        the accuracy computed by accuracy_score(yReal,yPred)
    """
    try:
        yPred = fitter.predict(x)
    except TypeError:
        # presumably the fitter rejects sparse input; retry with a dense copy
        yPred = fitter.predict(x.toarray())
    acc= accuracy_score(yReal,yPred)
    if (saveBad):
        # XXX could profile?
        profileLosers(saveDir,label,yPred,yReal,rawDat,objClass,x,
                      featureObjects)
    if (plot):
        fig = pPlotUtil.figure()
        ax = plt.subplot(1,1,1)
        numCols = colNames.size
        coeffs = fitterCoeff(fitter)
        nCoeffs = coeffs.size
        saveName = saveDir + label + "coeffs"
        # largest coefficient first
        sortIdx = np.argsort(coeffs)[::-1]
        sortedCoeffs = coeffs[sortIdx]
        sortedNames = colNames[sortIdx]
        # two-column table: name <tab> coefficient
        stacked = np.vstack((sortedNames,sortedCoeffs)).T
        np.savetxt(saveName,stacked,fmt=["%s","%.3g"],delimiter="\t")
        # single-argument print: valid, with identical output, on both
        # Python 2 and Python 3
        print("{}  Columns".format(numCols))
        maxToPlot = min(numCols//2,25) # on each side

        if( numCols == nCoeffs):
            # a coefficient per feature (column): show the strongest
            # 'maxToPlot' from each end, using the names as tick labels
            coeffsToPlot = list(sortedCoeffs[:maxToPlot]) + \
                           list(sortedCoeffs[-maxToPlot:])
            labelsToPlot = list(sortedNames[:maxToPlot]) +\
                           list(sortedNames[-maxToPlot:])
            xToPlot = range(len(coeffsToPlot))
            ax.bar(xToPlot,coeffsToPlot,align='center')
            ax.set_xticks(xToPlot)
            ax.set_xticklabels(labelsToPlot,rotation='vertical')
            plt.xlabel("coefficient name")
            plt.ylabel("Predictor strength")
        else:
            # no one-to-one mapping to columns: plot the raw coefficients
            plt.plot(range(nCoeffs),coeffs,'ro-')
            plt.xlabel("Fitter Coefficients")
            plt.ylabel("Predictor strength")
        pPlotUtil.savefig(fig,saveName)
    return acc
def plotFec(expect,algorithm,inFile,saveAs):
    """
    Plots the force-extension curve stored in inFile and marks two surface
    locations on it with vertical dashed lines.

    Args:
        expect: expected surface location, in the same units as the
            separation read from inFile
        algorithm: surface location found by the algorithm, same units
        inFile: input handed to HDF5Util.GetTimeSepForce
        saveAs: output path handed to pPlotUtil.savefig
    Returns:
        None
    """
    time,sep,force = HDF5Util.GetTimeSepForce(inFile)
    hFig = pPlotUtil.figure()
    IgorUtil.PlotFec(sep,force)
    sepMin = min(sep)
    # show only the first 10% of the separation range (scaled by 1e9, to
    # nm per the original comments); the plot is taken to start at 0
    spanNm = 1e9 * abs(max(sep)-sepMin)
    plt.xlim([0,spanNm/10])
    def toNm(value):
        # offset from the minimum separation, scaled by 1e9
        return (value-sepMin)*1e9
    markers = ((expect,"Expected surface location","g"),
               (algorithm,"Algorithm surface location","k"))
    for location,text,shade in markers:
        plt.axvline(toNm(location),label=text,lw=3,linestyle="--",
                    color=shade)
    pPlotUtil.legend()
    pPlotUtil.savefig(hFig,saveAs)
Beispiel #10
0
def plotAll(kArrs,outDir):
    """
    Saves one normalized histogram of K per entry of kArrs and returns the
    per-entry mean and standard deviation.

    Reads 'numOligos' and 'lengths' from the enclosing scope for the legend
    and title text.

    Args:
        kArrs: sequence of integer K arrays, one per DNA length
        outDir: path prefix; figure i is saved as outDir + "k<i>"
    Returns:
        tuple (means, stdevs) of numpy arrays, one element per entry of kArrs
    """
    maxK = max([max(k) for k in kArrs ])
    # One unit-wide bin per integer value 0..maxK, which takes maxK+2 edges.
    # With only maxK+1 edges the last bin, which is closed on the right,
    # would lump the counts for maxK-1 and maxK together.
    bins = range(maxK+2)
    means  = np.array([np.mean(k) for k in kArrs])
    stdevs = np.array([np.std(k) for k in kArrs])
    for i,k in enumerate(kArrs):
        fig = pPlotUtil.figure()
        # NOTE(review): newer matplotlib spells 'normed' as 'density';
        # confirm against the installed version before changing it
        plt.hist(k,bins=bins,align='left',label='Data from {:d} sequences'.
                 format(int(numOligos)),normed=True)
        mean = means[i]
        plt.axvline(mean,color='r',label="Mean:{:.3f}".format(mean),
                    linewidth=2.0)
        plt.xlim([0,maxK])
        plt.xlabel('K, minimum k-mer with at most 1 occurence in DNA sequence')
        plt.ylabel('Proportion of occurences')
        plt.title('K histogram (normalized) for DNA sequences  of length {:d}'.
                  format(lengths[i]))
        plt.legend()
        pPlotUtil.savefig(fig,outDir + "k{:d}".format(i))
    return means,stdevs
def profileLosers(saveDir,label,yPred,yActual,rawDat,dataClass,featureMat,
                  featureObjects):
    """
    Plots the feature matrix of the mispredicted samples and dumps their raw
    rows to a csv file.

    Reads the title font size 'g_title' from the enclosing scope.

    Args:
        saveDir: path prefix for the csv and the figure
        label: string used in both output file names
        yPred: predicted labels
        yActual: true labels
        rawDat: 2-D raw data; the mispredicted rows are written to csv
        dataClass: not used by this function
        featureMat: feature matrix handed to the plotting helpers
        featureObjects: feature descriptions handed to plotFeatMatr
    Returns:
        None
    """
    # indices of the mistakes, split by which way the prediction went
    wrongIdx,wrongAsDeath,wrongAsSurv = getIdxMistakes(yPred,yActual)
    nSurv = len(wrongAsSurv)
    nDead = len(wrongAsDeath)
    hFig = pPlotUtil.figure(xSize=16,ySize=12,dpi=200)
    def byPrediction(x):
        # ordering helper bound to this call's predictions
        return sortByPred(x,yPred,yActual)
    # get the matrix, all features 0 --> 1
    normalized = getNormalizedFeatureMatrix(wrongIdx,featureMat,byPrediction)
    plotFeatMatr(normalized,featureObjects,featureMat,saveDir,label,wrongIdx)
    # horizontal divider placed at the count of wrongly-predicted survivors
    plt.axhline(nSurv,linewidth=3,color='c')
    dividerTitle = \
        "Line Divides {:d} actual deceased from {:d} actual survived".\
        format(nSurv,nDead)
    plt.title(dividerTitle,y=1.3,fontsize=g_title)
    plt.legend(loc="upper right", bbox_to_anchor=(0.4, -0.4))
    # raw rows of everything we got wrong, for offline inspection
    wrongRows = rawDat[wrongIdx,:]
    csvPath = saveDir + 'debug_{:s}.csv'.format(label)
    np.savetxt(csvPath,wrongRows,fmt="%s",delimiter=',')
    pPlotUtil.savefig(hFig,saveDir + "mOut" + label,tight=True)
def plotSpotDist(mLabels,spots,outPath,subtractMean):
    """
    Saves three figures describing how the surface-location labels vary by
    spot: a 3-D stack of per-spot histograms, a wireframe of the per-spot
    mean, and a pooled histogram with its cumulative distribution.

    Args:
        mLabels: numpy array of surface-location labels; multiplied by 1e9
            to get nm (presumably given in meters -- confirm with caller)
        spots: numpy array, parallel to mLabels, of spot identifiers; values
            within 1e-9 of each other are treated as the same spot
        outPath: path prefix for "AllSpots.png", "Surface.png" and
            "FlatSpots.png"
        subtractMean: if true, histogram each spot's labels relative to that
            spot's mean
    Returns:
        None
    Raises:
        ValueError: from np.reshape when the number of distinct spots is not
            a multiple of 5 (the assumed grid width)
    """
    colors = ['r', 'g', 'b', 'y','k']
    nColors = len(colors)
    # go to nm
    mLabelsNm = mLabels *  1e9
    mSetSpots = sorted(set(spots))
    labelsBySpot = []
    rawBySpot = []
    flattenedFromMean = []
    # first, get the spot-wise labelling
    for spot in mSetSpots:
        # get the indices of the spots
        spotIdx = np.where(abs((spots - spot)) < 1e-9)[0]
        # indexing with an index array copies, so the in-place subtraction
        # below does not touch mLabelsNm
        thisSpotLabels = mLabelsNm[spotIdx]
        meanV = np.mean(thisSpotLabels)
        if (subtractMean):
            thisSpotLabels -= meanV
        labelsBySpot.append(thisSpotLabels)
        flattenedFromMean.extend(thisSpotLabels)
        # rawBySpot always holds the un-shifted values
        if (subtractMean):
            rawBySpot.append(thisSpotLabels + meanV)
        else:
            rawBySpot.append(thisSpotLabels)
    # common bin edges, spanning every (possibly mean-subtracted) label
    bins = np.linspace(min(flattenedFromMean),max(flattenedFromMean),10)
    fig = pPlotUtil.figure(xSize=12,ySize=12)
    ax = fig.add_subplot(111, projection='3d',)
    # one histogram per spot, stacked along the y direction
    for i,thisSpotLabels in enumerate(labelsBySpot):
        mColor = colors[i % nColors]
        height,left = np.histogram(thisSpotLabels,bins=bins)
        ax.bar(left[:-1], height, zs=i,zdir='y', color=mColor, alpha=0.7,
               edgecolor="none",linewidth=0)
    xStr = r'$\Delta$ from Expected Surface Loc. [nm]'
    pPlotUtil.lazyLabel(xStr,
                        "Surface Position (arb)",
                    "Dependence of Surface Location Distribution on Position",
                        zlab="Count")
    pPlotUtil.savefig(fig,outPath + "AllSpots.png")
    # get a figure showing the mean surface location, assuming
    # we reshape into an Nx(whatever) array
    N = 5
    meanVals = [np.mean(mList) for mList in rawBySpot]
    # -1: infer dimension
    meanSurf = np.reshape(meanVals,(-1,N))
    meanSurf -= np.min(meanSurf)
    fig = pPlotUtil.figure(ySize=14,xSize=10)
    ax = fig.add_subplot(111, projection='3d')
    # convert to nm (XXX assuming grid is 1micron for each)
    Nx = N
    Ny = meanSurf.shape[0]
    x = np.linspace(0, Nx, Nx) * 1e3
    y = np.linspace(0, Ny, Ny) * 1e3
    xv, yv = np.meshgrid(x, y)
    ax.plot_wireframe(xv,yv,meanSurf)
    pPlotUtil.lazyLabel("X Location [nm]","Y Location [nm]",
                        "Surface Position Varies with height")
    pPlotUtil.zlabel("Surface height (relative to min)")
    pPlotUtil.savefig(fig,outPath + "Surface.png")
    # pooled histogram (top) and its cumulative distribution (bottom)
    fig = pPlotUtil.figure(ySize=14,xSize=10)
    plt.subplot(2,1,1)
    nPoints = len(flattenedFromMean)
    vals,edges,_=plt.hist(flattenedFromMean,bins=bins)
    # add a 'fudge' factor to make plotting better,
    fudgeX = (max(edges)-min(edges))*0.05
    xlim = [min(edges)-fudgeX,max(edges)+fudgeX]
    pPlotUtil.lazyLabel(xStr,
                        "Number of counts",
                        "Algorithm finds surface within 10nm, >98%, N={:d}".\
                        format(nPoints))
    # range of the secondary axis: counts expressed as a proportion
    normed = [0,max(vals)/sum(vals)]
    plt.xlim(xlim)
    propAx = pPlotUtil.secondAxis(plt.gca(),"Proportion",normed,yColor="Red")
    # the y position must be a number: the string "0.05" is not a valid
    # coordinate on a numeric axis
    propAx.axhline(0.05,color='r',
                   label="5% of Curves",linestyle='--',linewidth=4.0)
    pPlotUtil.legend()
    # plot the CDF
    plt.subplot(2,1,2)
    # add a zero at the start, so the plot matches the PDF
    cdf = np.zeros(edges.size)
    cdf[1:] = np.cumsum(vals/sum(vals))
    mX = edges
    plt.plot(mX,cdf,linestyle='--',linewidth=4,color='k')
    plt.xlim(xlim)
    pPlotUtil.lazyLabel(xStr,
                        "Cummulative Proportion",
                        ("Cummulative Density Function," +
                         "Surface Detection Performance"))
    plt.gca().fill_between(mX, 0, cdf,alpha=0.3)
    pPlotUtil.savefig(fig,outPath + "FlatSpots.png")
Beispiel #13
0
 lengths = np.array([2,4,8,16,32,64,128,256,512])
 # save the K array: minimum k to have at most one k-mer
 # initialize to -1, so that we know when we have the minimum
 outDir = "./out/"
 pGenUtil.ensureDirExists(outDir)
 forceRun = False
 test = False
 # use checkpointing to save data, since it takes forever
 kArr = pCheckUtil.getCheckpoint('./tmp/check.pkl',getKSequence,forceRun,
                                 lengths,numOligos,weights,chars)
 meanVals,std = pCheckUtil.getCheckpoint('./tmp/meanStd.pkl',plotAll,
                                         forceRun,kArr,outDir)
 if (test):
     testDnaGeneration(chars,lengths,numOligos,weights)
 # plot the mean k vs dna length, l (in theory, k is approx log_1/q(l+1))
 fig = pPlotUtil.figure()
 ax = plt.subplot(1,3,1)
 plt.errorbar(x=lengths,y=meanVals,yerr=std,fmt='ro-',label='Mean K')
 tKVals = getTheoryK(lengths,q)
 plt.plot(lengths,tKVals,'b--',label='Log_[1/q](l+1)')
 xLab = 'DNA Length (l)'
 plt.xlabel(xLab)
 plt.ylabel('Mean K value')
 plt.title('Mean K vs length')
 ax.set_xscale('log')
 plt.legend(loc='best')
 ax = plt.subplot(1,3,2)
 plotError(meanVals,tKVals,lengths,xLab,'Absolute Error in Mean K ',
           'Absolute error in Mean K',ax,relative=False)
 ax = plt.subplot(1,3,3)
 plotError(meanVals,tKVals,lengths,xLab,'Relative Error in Mean K [0-->1]',
Beispiel #14
0
import PlotUtilities as pPlotUtil
import CheckpointUtilities as pCheckUtil

from scipy.stats import norm
# Script: plot the probability that a Normal(mean, stdev) variable lands
# within epsilon of an offset c0, as a function of c0, and save the figure
# as outDir + "q1_1".
outDir = "./out/"
pGenUtil.ensureDirExists(outDir)

mean = 0
stdev = 1
# float literal: under Python 2, 'stdev / 100' with an integer stdev is 0,
# which would make every probability below exactly zero
epsilon = stdev / 100.0
nPoints = 1000
normDist = norm(loc=mean, scale=stdev)
# scan three standard deviations either side of the mean
offsets = np.linspace(mean - 3 * stdev, mean + 3 * stdev, nPoints)
# normDist already carries loc and scale, so its CDF takes the raw offsets;
# standardizing them again would only be right for mean=0, stdev=1.
# The factor of 2 is kept from the original (the y axis is in arbitrary
# units).
probability = 2 * (normDist.cdf(offsets + epsilon) -
                   normDist.cdf(offsets - epsilon))

fig = pPlotUtil.figure()
plt.plot(offsets,probability,'r-',
         label="mu = {:.1f}, sigma = {:.1f}, epsilon = {:.2f}".\
         format(mean,stdev,epsilon))
plt.xlabel("offset for CDF, c0")
plt.ylabel("Probability (arbitrary units) to land within epsilon of c0")
plt.axvline(0,
            color='k',
            linestyle='--',
            label="Maximum probability when centered near mu")
plt.legend(loc='best')
plt.title("Probability of landing within epsilon of c0 maximized near mu")
pPlotUtil.savefig(fig, outDir + "q1_1")