Python scale_a_list Examples

Programming Language: Python

Namespace/Package Name: myUtils

Method/Function: scale_a_list

Examples at hotexamples.com: 10

Python scale_a_list - 10 examples found. These are the top rated real world Python examples of myUtils.scale_a_list extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: fithic.py Project: maleilei/fithic

def compare_Spline_FDR(splineFDRxinit,splineFDRyinit,splineFDRx,splineFDRy,figname,i):
    """Plot the FDR curve of spline pass ``i`` against the first pass.

    Both curves are drawn on a single axes (x label 'FDR threshold', y label
    'Significant contacts (x10^3)') and saved to ``figname + '.png'``.

    Args:
        splineFDRxinit: x values of the first-pass curve.
        splineFDRyinit: y values of the first-pass curve.
        splineFDRx: x values of the curve for pass ``i``.
        splineFDRy: y values of the curve for pass ``i``.
        figname: output path without the ``.png`` extension.
        i: pass number; only used to build the legend label ``spline-<i>``.
    """
    newlab = 'spline-' + str(i)
    plt.clf()
    fig = plt.figure()
    try:
        ax = fig.add_subplot(1,1,1)

        # The first element of every series is skipped; y values are scaled
        # with the module-level toKb factor before plotting.
        plt.plot(splineFDRx[1:],myUtils.scale_a_list(splineFDRy[1:],toKb), 'r+-',label=newlab)
        plt.plot(splineFDRxinit[1:],myUtils.scale_a_list(splineFDRyinit[1:],toKb), 'g.-',label='spline-1')
        plt.xlabel('FDR threshold')
        plt.ylabel('Significant contacts (x10$^{3}$)')
        plt.gca().yaxis.set_major_locator( MaxNLocator(prune='lower'))
        lg=ax.legend(loc="lower right")
        lg.draw_frame(False)
        plt.savefig(figname+'.png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release this one so repeated calls do not accumulate figures.
        plt.close(fig)

Example #2

Show file

File: fithic.py Project: ay-lab/fithic

def compare_Spline_FDR(splineFDRxinit,splineFDRyinit,splineFDRx,splineFDRy,figname,i):
    """Plot the FDR curve of spline pass ``i`` against the first pass.

    Both curves are drawn on a single axes (x label 'FDR threshold', y label
    'Significant contacts (x10^3)') and saved to ``figname + '.png'``.

    Args:
        splineFDRxinit: x values of the first-pass curve.
        splineFDRyinit: y values of the first-pass curve.
        splineFDRx: x values of the curve for pass ``i``.
        splineFDRy: y values of the curve for pass ``i``.
        figname: output path without the ``.png`` extension.
        i: pass number; only used to build the legend label ``spline-<i>``.
    """
    newlab = 'spline-' + str(i)
    plt.clf()
    fig = plt.figure()
    try:
        ax = fig.add_subplot(1,1,1)

        # The first element of every series is skipped; y values are scaled
        # with the module-level toKb factor before plotting.
        plt.plot(splineFDRx[1:],myUtils.scale_a_list(splineFDRy[1:],toKb), 'r+-',label=newlab)
        plt.plot(splineFDRxinit[1:],myUtils.scale_a_list(splineFDRyinit[1:],toKb), 'g.-',label='spline-1')
        plt.xlabel('FDR threshold')
        plt.ylabel('Significant contacts (x10$^{3}$)')
        plt.gca().yaxis.set_major_locator( MaxNLocator(prune='lower'))
        lg=ax.legend(loc="lower right")
        lg.draw_frame(False)
        plt.savefig(figname+'.png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release this one so repeated calls do not accumulate figures.
        plt.close(fig)

Example #3

Show file

def compareFits_Spline(splineXinit, splineYinit, splineX, splineY, figname, X):
    """Plot the first-pass spline, optionally overlaid with a later pass.

    A random sample of at most 5000 points of the first-pass spline is always
    drawn.  When ``figname`` does not end in ``'1'`` the spline of the current
    pass is sampled and drawn as well; otherwise the axes are limited to a
    fixed window.  The figure is saved to ``figname + '.png'``.

    Args:
        splineXinit: x values of the first-pass spline.
        splineYinit: y values of the first-pass spline.
        splineX: x values of the current-pass spline.
        splineY: y values of the current-pass spline.
        figname: output path without ``.png``; its last character decides
            whether this call is treated as the first pass.
        X: pass identifier appended to the legend label ``spline-``.
    """
    downsample = min(5000, len(splineXinit))

    def _sampled_and_scaled(xs, ys):
        # Draw `downsample` indices with np.random.choice (which samples with
        # replacement unless told otherwise), keep them in ascending order and
        # scale the picked points with the module-level toKb / toProb factors.
        picked = sorted(np.random.choice(list(range(len(xs))), downsample))
        return (myUtils.scale_a_list([xs[k] for k in picked], toKb),
                myUtils.scale_a_list([ys[k] for k in picked], toProb))

    plt.clf()
    fig = plt.figure()
    try:
        ax = fig.add_subplot(1, 1, 1)
        x, y = _sampled_and_scaled(splineXinit, splineYinit)
        plt.plot(x, y, 'g.-', label='spline-1')

        if figname[-1] != '1':  # meaning this is not the very first step
            x, y = _sampled_and_scaled(splineX, splineY)
            # str() keeps this working when X is passed as a number.
            plt.plot(x, y, 'r.-', label='spline-' + str(X))
        else:  # plot only at a limited range and plot discrete binning
            # x still holds the scaled first-pass sample here.
            if max(x) > 1000:  # if it's a big genome
                plt.xlim([500, 1000])
                plt.ylim([0, 1.0])
            else:  # small genome
                plt.xlim([50, 100])
                plt.ylim([0, 0.5])

        ax.legend(loc="upper right")
        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact probability (x10$^{-5}$)')
        plt.gca().yaxis.set_major_locator(MaxNLocator(prune='lower'))
        plt.savefig(figname + '.png')
    finally:
        # pyplot keeps figures alive until closed; avoid piling them up.
        plt.close(fig)

Example #4

Show file

File: fithic.py Project: ay-lab/fithic

def compareFits_Spline(splineXinit,splineYinit,splineX,splineY,figname,X):
    """Plot the first-pass spline, optionally overlaid with a later pass.

    A random sample of at most 5000 points of the first-pass spline is always
    drawn.  When ``figname`` does not end in ``'1'`` the spline of the current
    pass is sampled and drawn as well; otherwise the axes are limited to a
    fixed window.  The figure is saved to ``figname + '.png'``.

    Args:
        splineXinit: x values of the first-pass spline.
        splineYinit: y values of the first-pass spline.
        splineX: x values of the current-pass spline.
        splineY: y values of the current-pass spline.
        figname: output path without ``.png``; its last character decides
            whether this call is treated as the first pass.
        X: pass identifier appended to the legend label ``spline-``.
    """
    downsample=min(5000,len(splineXinit))

    def _sampled_and_scaled(xs,ys):
        # Draw `downsample` indices with np.random.choice (which samples with
        # replacement unless told otherwise), keep them in ascending order and
        # scale the picked points with the module-level toKb / toProb factors.
        picked=sorted(np.random.choice(list(range(len(xs))),downsample))
        return (myUtils.scale_a_list([xs[k] for k in picked],toKb),
                myUtils.scale_a_list([ys[k] for k in picked],toProb))

    plt.clf()
    fig = plt.figure()
    try:
        ax = fig.add_subplot(1,1,1)
        x,y=_sampled_and_scaled(splineXinit,splineYinit)
        plt.plot(x,y,'g.-',label='spline-1')

        if figname[-1]!='1': # meaning this is not the very first step
            x,y=_sampled_and_scaled(splineX,splineY)
            # str() keeps this working when X is passed as a number.
            plt.plot(x,y,'r.-',label='spline-'+str(X))
        else: # plot only at a limited range and plot discrete binning
            # x still holds the scaled first-pass sample here.
            if max(x)>1000: # if it's a big genome
                plt.xlim([500,1000])
                plt.ylim([0,1.0])
            else: # small genome
                plt.xlim([50,100])
                plt.ylim([0,0.5])

        ax.legend(loc="upper right")
        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact probability (x10$^{-5}$)')
        plt.gca().yaxis.set_major_locator( MaxNLocator(prune='lower'))
        plt.savefig(figname+'.png')
    finally:
        # pyplot keeps figures alive until closed; avoid piling them up.
        plt.close(fig)

Example #5

Show file

def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit a non-increasing spline to the binned data and write p/q-values.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    is fitted to ``(x, y)`` with smoothing ``min(y) ** 2``, evaluated at every
    key of ``mainDic`` inside ``[min(x), max(x)]`` and passed through
    ``IsotonicRegression(increasing=False)``.  Each line of the gzipped
    interaction file then receives a p-value, the p-values are corrected with
    ``myStats.benjamini_hochberg_correction`` and the rows are written to a
    gzipped significances file.

    Reads the module-level names ``logfile``, ``interOnly``, ``allReg``,
    ``visual``, ``distLowThres``, ``distUpThres``, ``interChrProb``, ``toKb``
    and ``toProb``.

    Args:
        mainDic: mapping whose keys are the distances at which the spline is
            evaluated.
        x: bin distances; sorted IN PLACE when ``outliersdist`` is not None.
        y: bin probabilities matching ``x``.
        yerr: error bars for ``y`` (plotting only).
        infilename: gzipped text file with five whitespace-separated columns
            per line (chr1, mid1, chr2, mid2, contactCount).
        outfilename: prefix of the output files.
        biasDic: optional ``{chr: {mid: bias}}`` lookup; missing entries
            default to a bias of 1.0.
        outliersline: set that receives the line index of every outlier.
        outliersdist: set that receives ``abs(mid1 - mid2)`` of every outlier.
        observedIntraInRangeSum: trial count for in-range intra p-values.
        possibleIntraInRangeCount: test count used for the intra correction.
        possibleInterAllCount: test count used for the inter correction.
        observedIntraAllSum: not used by this function.
        observedInterAllSum: trial count for inter-chromosomal p-values.
        resolution: when truthy, added to the output file name.
        passNo: pass number used in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``; the first three are None when ``interOnly`` is set.

    Exits the process with status 2 when ``x`` is not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    # Range of the binned distances; only set (and only needed) for intra fits.
    minX = None
    maxX = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        # The spline fit below needs strictly increasing x values.
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                print(
                    "ERROR in spline fitting. Distances do not decrease across bins. Ensure interaction file is correct."
                )
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        minY = min(y)
        splineError = minY * minY

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        minX = min(x)
        maxX = max(x)
        # Keep only distances inside [min(x), max(x)], i.e. where the spline
        # is defined.
        splineX = [dis for dis in sorted(mainDic) if minX <= dis <= maxX]
        splineY = ius(splineX)

        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([i * i for i in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            try:
                ax = fig.add_subplot(2, 1, 1)
                plt.plot(myUtils.scale_a_list(splineX, toKb),
                         myUtils.scale_a_list(newSplineY, toProb),
                         'g-',
                         label="spline-" + str(passNo),
                         linewidth=2)
                plt.errorbar(myUtils.scale_a_list(x, toKb),
                             myUtils.scale_a_list(y, toProb),
                             myUtils.scale_a_list(yerr, toProb),
                             fmt='r.',
                             label="Mean with std. error",
                             linewidth=2)

                plt.ylabel('Contact probability (x10$^{-5}$)')
                plt.xlabel('Genomic distance (kb)')
                if distLowThres > 0 and distUpThres < float("inf"):
                    plt.xlim(
                        myUtils.scale_a_list([distLowThres, distUpThres],
                                             toKb))
                plt.gca().yaxis.set_major_locator(
                    MaxNLocator(nbins=3, prune=None))
                ax.legend(loc="upper right")

                ax = fig.add_subplot(2, 1, 2)

                plt.loglog(splineX, newSplineY, 'g-')
                plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
                if distLowThres > 0 and distUpThres < float("inf"):
                    plt.xlim([distLowThres, distUpThres])
                plt.ylabel('Contact probability (log-scale)')
                plt.xlabel('Genomic distance (log-scale)')

                plt.savefig(outfilename + '.png')
            finally:
                # pyplot keeps figures alive until they are closed.
                plt.close(fig)

    # NOW compute the p-value of every line of the interaction file
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            interxn = myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
            interxn.setCount(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interactionType = interxn.getType(distLowThres, distUpThres)
            # assumes there is no bias to begin with
            # if the biasDic is not null sets the real bias values
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # negative bias on an intra pair: never significant
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # clamp the distance into the range covered by the bins
                distToLookUp = min(max(interxn.getDistance(), minX), maxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif (interactionType == 'intraShort'
                  or interactionType == 'intraLong') and not interOnly:
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            # exactly one p-value per input line keeps p_vals aligned with
            # the line index used when the file is re-read below
            p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        outlierThres = 1.0 / (possibleIntraInRangeCount +
                              possibleInterAllCount)
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0 / possibleInterAllCount
        q_vals = myStats.benjamini_hochberg_correction(p_vals,
                                                       possibleInterAllCount)
    else:
        outlierThres = 1.0 / possibleIntraInRangeCount
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    #now we write the values back to the file
    if resolution:
        outpath = (outfilename + '.res' + str(resolution) +
                   '.significances.txt.gz')
    else:
        outpath = outfilename + '.significances.txt.gz'
    rowFormat = "%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(outpath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" %
              (outfilename + ".significances.txt"))
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]
            bias1 = biasl[count]
            bias2 = biasr[count]

            if chr1 != chr2:
                # inter-chromosomal rows are reported only in allReg/interOnly
                keep = allReg or interOnly
            else:
                # intra rows must also fall inside the distance thresholds
                keep = (allReg or not interOnly) and myUtils.in_range_check(
                    abs(midPoint1 - midPoint2), distLowThres, distUpThres)
            if keep:
                outfile.write(rowFormat %
                              (str(chr1), midPoint1, str(chr2), midPoint2,
                               interactionCount, p_val, q_val, bias1, bias2))

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1 - midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [
        splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy
    ]  # from fit_Spline

Example #6

Show file

def calculate_Probabilities(sortedInteractions, isOutlier, figname):
    """Bin the interactions and compute per-bin mean probability and error.

    Rows of ``sortedInteractions`` are walked in order and grouped into bins;
    rows whose ``isOutlier`` flag is non-zero are left out of the per-bin
    statistics.  For every emitted bin the average distance, the mean contact
    probability and its standard error are recorded, optionally plotted, and
    written to ``outdir/figname.txt``.

    Reads the module-level names ``outdir``, ``observedIntraInRangeSum``,
    ``noOfBins``, ``useBinning``, ``distScaling``, ``visual``, ``libname``,
    ``toKb`` and ``toProb``.

    Args:
        sortedInteractions: re-iterable of rows where ``row[0]`` is the
            interaction distance and ``row[1]`` the interaction count.
        isOutlier: sequence indexed like ``sortedInteractions``; a value of 0
            marks a row that takes part in the statistics.
        figname: base name (no extension) of the ``.txt`` / ``.png`` outputs.

    Returns:
        ``[x, y, yerr]``: per-bin average distance, mean probability and
        standard error.
    """
    sys.stderr.write(
        "\nCalculating probability means and standard deviations by equal occupancy binning of interaction data\n"
    )
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )

    desiredPerBin = (observedIntraInRangeSum) / noOfBins
    sys.stderr.write("observedIntraInRangeSum\t" +
                     repr(observedIntraInRangeSum) + "\tdesiredPerBin\t" +
                     repr(desiredPerBin) + "\tnoOfBins\t" + repr(noOfBins) +
                     "\n")

    # the following five lists will be the print outputs
    x = []  # avg genomic distances of bins
    y = []  # avg interaction probabilities of bins
    yerr = []  # stderrs of bins
    pairCounts = []  # number of pairs in bins
    interactionTotals = []  # number of interactions (reads) in bins

    # the following variables will be used to calculate the above five lists
    noOfPairsForBin = 0
    meanCountPerPair = 0
    M2 = 0
    interactionTotalForBin = 0
    interactionTotalForBinTermination = 0
    distanceTotalForBin = 0
    lastDistanceForBin = -1
    lastInteraction = sum(1 for _ in sortedInteractions)  # number of rows
    lcount = 0  # this will increase by eachrow in sortedInteractions

    for eachrow in sortedInteractions:
        interactionDistance = eachrow[0]
        interactionCount = eachrow[1]

        # A bin is closed when the distance changes and, with useBinning on,
        # the running total has reached desiredPerBin.
        # NOTE(review): lcount only takes the values 0 .. lastInteraction-1
        # at this point, so the `lcount == lastInteraction` clause never
        # fires and whatever is accumulated after the last closed bin is not
        # emitted -- confirm whether that is intended before changing it.
        if noOfPairsForBin > 0 and (
                (useBinning == False and lastDistanceForBin != -1
                 and lastDistanceForBin != interactionDistance)
                or (useBinning == True and lastDistanceForBin != -1
                    and interactionTotalForBinTermination >= desiredPerBin
                    and lastDistanceForBin != interactionDistance)
                or lcount == lastInteraction):

            # calculate the things that need to be calculated
            avgDistance = (distanceTotalForBin / noOfPairsForBin) * distScaling
            meanProbabilityObsv = (meanCountPerPair *
                                   1.0) / observedIntraInRangeSum
            se_p = meanProbabilityObsv
            # update se_p if there are more than 1 pairs in the bin
            if noOfPairsForBin > 1:
                var = M2 / (noOfPairsForBin - 1)
                sd = math.sqrt(var)
                se = sd / math.sqrt(noOfPairsForBin)
                se_p = se / observedIntraInRangeSum

            # append the calculated vals to corresponding lists
            x.append(float(avgDistance))
            y.append(float(meanProbabilityObsv))
            yerr.append(float(se_p))
            pairCounts.append(noOfPairsForBin)
            interactionTotals.append(interactionTotalForBin)

            # now that we saved what we need
            # set the values back to defaults and go on to the next bin
            noOfPairsForBin = 0
            meanCountPerPair = 0
            M2 = 0
            interactionTotalForBin = 0
            interactionTotalForBinTermination = 0
            distanceTotalForBin = 0
            lastDistanceForBin = -1

        # Only non-outlier rows feed the statistics (this matters for the
        # second pass); mean and M2 are updated incrementally per row.
        if isOutlier[lcount] == 0:
            distanceTotalForBin += interactionDistance / distScaling
            interactionTotalForBin += interactionCount
            noOfPairsForBin += 1
            delta = interactionCount - meanCountPerPair
            meanCountPerPair += (delta * 1.0) / noOfPairsForBin
            M2 += delta * (interactionCount - meanCountPerPair)
        # Outliers still count towards the bin-termination total.
        interactionTotalForBinTermination += interactionCount
        lcount += 1
        lastDistanceForBin = interactionDistance

    if visual == True:
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        try:
            ax = fig.add_subplot(111)
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list(y, toProb),
                     'ro',
                     label="Mean")
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='k.',
                         label="Standard error")
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            titleStr = ('Binning observed interactions using equal occupancy bins.\n No. of bins: '
                        + str(noOfBins) + ', Library: ' + str(libname) +
                        ', No. of interactions: ' +
                        str(observedIntraInRangeSum))
            plt.title(titleStr, size='small')
            ax.legend(loc="upper right")
            plt.savefig(outdir + '/' + figname + '.png')
        finally:
            # pyplot keeps figures alive until they are closed.
            plt.close(fig)

    sys.stderr.write("Writing %s" % figname + ".txt\n")

    # The output file is opened only now and inside a context manager, so the
    # handle is released even if a write fails.
    with open(outdir + '/' + figname + '.txt', 'w') as outfile:
        outfile.write(
            "avgGenomicDist\tcontactProbability\tstandardError\tnoOfLocusPairs\ttotalOfContactCounts\n"
        )
        for dist, prob, err, pairs, total in zip(x, y, yerr, pairCounts,
                                                 interactionTotals):
            outfile.write("%d\t%.2e\t%.2e\t%d\t%d\n" %
                          (dist, prob, err, pairs, total))
    return [x, y, yerr]  # from calculate_Probabilities

Example #7

Show file

def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname,
               passNo):
    """Fit an antitonic spline to the binned data, score and flag outliers.

    A ``UnivariateSpline`` is fitted to ``(x, y)`` with smoothing
    ``min(y) ** 2``, evaluated at the distinct integer distances of
    ``sortedInteractions`` that lie in ``[min(x), max(x)]`` and made
    non-increasing through the R function ``monoreg`` (``type="antitonic"``).
    Every line of the gzipped interaction file then gets a p-value, the
    p-values are corrected with ``myStats.benjamini_hochberg_correction`` and
    the rows are written to ``outdir/figname.significances.txt.gz``.  Finally
    each row of ``sortedInteractions`` is flagged as outlier when its p-value
    is below ``1 / possibleIntraInRangeCount``.

    Reads the module-level names ``baselineIntraChrProb``,
    ``baselineInterChrProb``, ``visual``, ``useInters``, ``distLowThres``,
    ``distUpThres``, ``observedIntraInRangeSum``, ``observedIntraAllSum``,
    ``observedInterAllSum``, ``possibleInterAllCount``,
    ``possibleIntraAllCount``, ``possibleIntraInRangeCount``, ``outdir``,
    ``maxObservedGenomicDist``, ``toKb`` and ``toProb``.

    Args:
        x: bin distances.
        y: bin probabilities matching ``x``.
        yerr: error bars for ``y`` (plotting only).
        infilename: gzipped file with five whitespace-separated columns per
            line (chr1, mid1, chr2, mid2, count).
        sortedInteractions: rows of ``(distance, count, bias12)``.
        biasDic: ``{chr: {mid: bias}}`` lookup; missing entries default to 1.0.
        figname: base name (no extension) of all output files.
        passNo: pass number used in plot legends.

    Returns:
        ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )
    sys.stderr.write("baseline intra-chr probability: " +
                     repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " +
                     repr(baselineInterChrProb) + "\n")

    # assume residualFactor==-1:
    splineError = min(y) * min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # Range of the binned distances; reused by every look-up below.
    minX = min(x)
    maxX = max(x)
    tempList = sorted(set(int(row[0]) for row in sortedInteractions))
    # Keep only distances inside [min(x), max(x)], i.e. where the spline is
    # defined.
    splineX = [d for d in tempList if minX <= d <= maxX]
    splineY = ius(splineX)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### done by calling the R function monoreg with type="antitonic"
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        try:
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-',
                     label="spline-" + str(passNo),
                     linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.',
                         label="Mean with std. error",
                         linewidth=2)

            if useInters:
                plt.plot(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(
                             [baselineIntraChrProb for i in x], toProb),
                         'k-',
                         label="Baseline intra-chromosomal")
                # The inter-chromosomal baseline uses baselineInterChrProb;
                # the intra value was plotted here under the wrong label.
                plt.plot(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(
                             [baselineInterChrProb for i in x], toProb),
                         'b-',
                         label="Baseline inter-chromosomal")
            plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
            plt.xlabel('Genomic distance (kb)', fontsize='large')
            if distLowThres > -1 and distUpThres > -1:
                plt.xlim(
                    myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(
                MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)

            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if useInters:
                plt.loglog(x, [baselineIntraChrProb for i in x], 'k-')
                plt.loglog(x, [baselineInterChrProb for i in x], 'b-')
            if distLowThres > -1 and distUpThres > -1:
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)', fontsize='large')
            plt.xlabel('Genomic distance (log-scale)', fontsize='large')

            plt.savefig(outdir + '/' + figname + '.png')
        finally:
            # pyplot keeps figures alive until they are closed.
            plt.close(fig)

    # NOW compute the p-value of every line of the interaction file
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " +
                     repr(distUpThres) + "\n")
    p_vals = []
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words = line.rstrip().split()
            interxn = myUtils.Interaction(
                [words[0], int(words[1]), words[2],
                 int(words[3])])
            interxn.setCount(int(words[4]))
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])

            # assumes there is no bias to begin with
            # if the biasDic is not null sets the real bias values
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
                p_val = 1.0
            else:
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # make sure the interaction distance is covered by the
                    # probability bins
                    distToLookUp = min(max(interxn.distance, minX), maxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    intraInRangeCount += 1
                    ############# THIS HAS TO BE interactionCount-1 ##########
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                    intraVeryProximalCount += 1
                elif interactionType == 'intraLong':
                    # out of range bigger than distUpThres
                    prior_p = 1.0
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraAllSum, prior_p)
                    intraOutOfRangeCount += 1
                elif useInters:
                    prior_p = baselineInterChrProb * (bias1 * bias2)
                    ############# THIS HAS TO BE interactionCount-1 ##########
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedInterAllSum, prior_p)
                    interCount += 1
                else:
                    # Lines that are not scored still need a placeholder:
                    # the writer below indexes p_vals by line number, so a
                    # skipped entry would shift every later p-value.
                    p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) +
                         "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " +
                         repr(possibleIntraInRangeCount) + "\n")

    with gzip.open(infilename, 'r') as infile, \
            gzip.open(outdir + '/' + figname + '.significances.txt.gz',
                      'w') as outfile:
        sys.stderr.write("Writing p-values to file %s" % figname +
                         ".significances.txt.gz\n")
        count = 0
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )

        for line in infile:
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if useInters == False and chrNo1 == chrNo2:  # intra
                interactionDistance = abs(midPoint1 - midPoint2)  # dist
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                                  (str(chrNo1), midPoint1, str(chrNo2),
                                   midPoint2, interactionCount, p_val, q_val))
            elif useInters == True and chrNo1 != chrNo2:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))

            count += 1

    # Flag every row of sortedInteractions whose p-value is below 1/M.
    isOutlier = []
    distsBelow = []
    intcountsBelow = []
    intcountsAbove = []
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp = min(max(interactionDistance, minX), maxX)
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)  # biases added in the picture
        ############# THIS HAS TO BE interactionCount-1 ##################
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum,
                           prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" %
                         figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        try:
            ax = fig.add_subplot(111)
            # sample() needs an integer size, hence floor division.  The
            # non-outlier draw is not plotted; it is kept so the outlier draw
            # below consumes the random stream exactly as before.
            downsample = 30  # for the non-outliers
            sample([i for i in range(len(intcountsAbove))],
                   len(intcountsAbove) // downsample)
            downsample = 20  # for the outliers
            randIndcsBelow = sample([i for i in range(len(intcountsBelow))],
                                    len(intcountsBelow) // downsample)
            randIndcsBelow = sorted(randIndcsBelow)

            plt.plot(myUtils.scale_a_list(
                [distsBelow[i] for i in randIndcsBelow], toKb),
                     [intcountsBelow[i] for i in randIndcsBelow],
                     'r.',
                     label="Outliers (p-value < 1/M)")
            plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist],
                                          toKb),
                     [
                         newSplineY[i] * observedIntraInRangeSum
                         for i in range(len(newSplineY))
                     ] + [newSplineY[-1] * observedIntraInRangeSum],
                     'g-',
                     label="spline-" + str(passNo) + " (x N)",
                     linewidth=2.5)

            plt.xlabel('Genomic distance (kb)')
            plt.ylabel('Contact counts')
            print(repr(len(intcountsBelow)) + "\t"),
            ## this limits y-axis of the hit count plots
            if len(intcountsBelow) > 0:
                plt.ylim([0, min(max(intcountsBelow), 1500)])
            if distLowThres > -1 and distUpThres > -1:
                plt.xlim([0, distUpThres * toKb])
            ax.legend(loc="upper right", fancybox=True)
            plt.savefig(outdir + '/' + figname + '.extractOutliers.png')
        finally:
            # pyplot keeps figures alive until they are closed.
            plt.close(fig)

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount) +
                     "\tintraOutOfRangeCount " + repr(intraOutOfRangeCount) +
                     "\tintraVeryProximalCount " +
                     repr(intraVeryProximalCount) + "\tinterCount " +
                     repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname +
                         ".qplot.png\n")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              figname + ".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx,
            FDRy]  # from fit_Spline

Example #8

Show file

File: fithic.py Project: ay-lab/fithic

def _bias_for_locus(biasDic, chrom, mid):
    """Return the bias stored for (chrom, mid); report and return -1 if it is missing."""
    if chrom not in biasDic:
        print("Warning. Bias file does not contain chromosome %s. "
              "Please ensure you're using correct file. Fit-Hi-C will continue with "
              "bias = -1 for this locus" % chrom)
        return -1
    if mid not in biasDic[chrom]:
        print("Error. Bias file does not contain midpoint %s within "
              "%s. Please ensure you're using the correct file and/or resolution "
              "argument. Fit-Hi-C will continue with bias = -1 for this locus"
              % (mid, chrom))
        return -1
    return biasDic[chrom][mid]


def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic,outliersline,outliersdist,observedIntraInRangeSum, possibleIntraInRangeCount, possibleInterAllCount, observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit a non-increasing spline to the binned probabilities and score every interaction.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline`` is
    fitted to (x, y), evaluated at the keys of ``mainDic`` that lie inside
    [min(x), max(x)], and made non-increasing with ``IsotonicRegression``.
    Every line of ``infilename`` then receives a p-value, the p-values are
    corrected with ``myStats.benjamini_hochberg_correction`` and the results
    are written to ``outfilename + '[.res<resolution>].significances.txt.gz'``.

    Parameters:
        mainDic: iterable whose items (keys, if a dict) are the distances at
            which the spline is evaluated.
        x, y, yerr: per-bin distance, value and error sequences. ``x`` is
            sorted in place (and ``y`` reordered to match) when
            ``outliersdist`` is not None.
        infilename: gzip text file with five whitespace-separated columns
            (chr1, mid1, chr2, mid2, contactCount).
        outfilename: prefix used for the plot, significance and q-plot files.
        biasDic: mapping chromosome -> midpoint -> bias; a falsy value means
            every bias is 1.0.
        outliersline, outliersdist: containers with an ``add`` method that
            receive the line index and the |mid1 - mid2| value of every line
            whose p-value is below the outlier threshold.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to
            ``scsp.bdtrc`` for the intra and inter models respectively.
        possibleIntraInRangeCount, possibleInterAllCount: numbers of possible
            tests used for the outlier threshold and the FDR correction.
        observedIntraAllSum: not used by this function.
        resolution: when truthy, it is embedded in the output file name.
        passNo: number shown in the plot legend.

    Returns:
        [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy];
        the first three entries are None when ``interOnly`` is set.

    Exits the process with status 2 if ``x`` is not strictly increasing.
    Reads the module globals logfile, interOnly, allReg, visual, distLowThres,
    distUpThres, interChrProb, toKb and toProb.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x,y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1,len(x)):
            if x[i]<=x[i-1]:
                # the spline fit needs strictly increasing x values
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i-1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError=min(y)*min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        # computed once here; they are reused for every input line further down
        tempMaxX=max(x)
        tempMinX=min(x)
        ### Keep only distances inside [min(x) max(x)] so that everything
        ### stays within the range where the spline is defined
        splineX=[dis for dis in sorted(mainDic) if tempMinX<=dis<=tempMaxX]
        splineY=ius(splineX)

        # force the fitted values to be non-increasing
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX,splineY)
        residual =sum([i*i for i in (y - ius(x))])

        if visual==True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2,1,1)
            plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)

            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
            plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2,1,2)

            plt.loglog(splineX,newSplineY,'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename+'.png')

    # First pass over the input: one p-value (and one bias pair) per line
    intraInRangeCount=0
    intraOutOfRangeCount=0
    intraVeryProximalCount=0
    interCount=0
    discardCount=0
    p_vals=[]
    q_vals=[]
    biasl=[]
    biasr=[]
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1,mid1,ch2,mid2,contactCount=line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1); mid2 = int(mid2)
            interxn=myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres,distUpThres)
            bias1=1.0; bias2=1.0;  # assumes there is no bias to begin with
            # if the biasDic is not null sets the real bias values
            if biasDic:
                bias1=_bias_for_locus(biasDic, ch1, mid1)
                bias2=_bias_for_locus(biasDic, ch2, mid2)
            biasl.append(bias1)
            biasr.append(bias2)
            if (bias1<0 or bias2<0) and interactionType !='inter':
                p_val=1.0
                discardCount+=1
            elif interactionType=='intraInRange' and not interOnly:
                # clamp the distance into the range covered by the bins
                distToLookUp=min(max(interxn.getDistance(),tempMinX),tempMaxX)
                i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
                prior_p=newSplineY[i]*(bias1*bias2)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedIntraInRangeSum,prior_p)
                intraInRangeCount +=1
            elif interactionType =='intraShort' and not interOnly:
                p_val=1.0
                intraVeryProximalCount += 1
            elif interactionType =='intraLong' and not interOnly:
                p_val=1.0
                intraOutOfRangeCount += 1
            elif allReg or interOnly:
                prior_p=interChrProb*(bias1*bias2)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedInterAllSum,prior_p)
                interCount += 1
            else:
                p_val=1.0
            # exactly one entry per line: the second pass indexes by line number
            p_vals.append(p_val)

    outlierThres = 0
    # Do the BH FDR correction
    if allReg:
        outlierThres=1.0/(possibleIntraInRangeCount+possibleInterAllCount)
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0/possibleInterAllCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0/possibleIntraInRangeCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: write the values back to the file
    if resolution:
        significancesName=outfilename+'.res'+str(resolution)+'.significances.txt.gz'
    else:
        significancesName=outfilename+'.significances.txt.gz'
    print("Writing p-values and q-values to file %s" % (outfilename + ".significances.txt"))
    with gzip.open(infilename, 'rt') as infile, gzip.open(significancesName, 'wt') as outfile:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
        for count, line in enumerate(infile):
            words=line.rstrip().split()
            chr1=words[0]
            midPoint1=int(words[1])
            chr2=words[2]
            midPoint2=int(words[3])
            interactionCount=float(words[4])
            p_val=p_vals[count]
            q_val=q_vals[count]
            bias1=biasl[count]
            bias2=biasr[count]

            if (allReg or interOnly) and chr1!=chr2:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))
            if (allReg or not interOnly) and chr1==chr2:
                interactionDistance = abs(midPoint1-midPoint2)
                if myUtils.in_range_check(interactionDistance,distLowThres, distUpThres):
                    outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))

            if p_val<outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1-midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR=0.0
    maxFDR=0.05
    increment=0.001
    FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,outfilename+".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy] # from fit_Spline

Example #9

Show file

File: fit-hi-c.py Project: jmrinaldi/HiC-pipeline

def _summarize_bin(noOfPairsForBin, meanCountPerPair, M2, distanceTotalForBin):
	"""Summarise one finished bin as (avgDistance, meanProbability, stdErrProbability)."""
	avgDistance=(distanceTotalForBin/noOfPairsForBin)*distScaling
	meanProbabilityObsv=(meanCountPerPair*1.0)/observedIntraInRangeSum
	# with a single pair there is no spread to estimate: the mean itself is reported
	se_p=meanProbabilityObsv
	if noOfPairsForBin>1:
		var=M2/(noOfPairsForBin-1)
		sd=math.sqrt(var)
		se=sd/math.sqrt(noOfPairsForBin)
		se_p=se/observedIntraInRangeSum
	return float(avgDistance), float(meanProbabilityObsv), float(se_p)


def calculate_Probabilities(sortedInteractions,isOutlier,figname):
	"""Bin the interactions and return per-bin mean probabilities and standard errors.

	sortedInteractions -- sequence of rows whose first two items are the
		interaction distance and the interaction count; rows are consumed in
		the given order and a bin can only end where the distance changes.
	isOutlier -- sequence parallel to sortedInteractions; only rows whose flag
		equals 0 contribute to the bin statistics.
	figname -- base name (inside the module-level outdir) of the '.txt' table
		and, when the module-level visual flag is set, of the '.png' plot.

	With useBinning set, a bin is closed once the counts seen since the last
	bin boundary reach observedIntraInRangeSum/noOfBins and the distance
	changes; otherwise every distinct distance forms its own bin. The bin that
	is still open when the input ends is emitted as well.

	Returns [x, y, yerr]: average distance, mean probability and standard
	error of each bin. Reads the module globals observedIntraInRangeSum,
	noOfBins, useBinning, distScaling, outdir, visual, libname, toKb, toProb.
	"""
	sys.stderr.write("\nCalculating probability means and standard deviations by equal occupancy binning of interaction data\n")
	sys.stderr.write("------------------------------------------------------------------------------------\n")

	desiredPerBin=(observedIntraInRangeSum)/noOfBins
	sys.stderr.write("observedIntraInRangeSum\t"+repr(observedIntraInRangeSum)+ "\tdesiredPerBin\t" +repr(desiredPerBin)+"\tnoOfBins\t"+repr(noOfBins)+"\n")

	# the following five lists will be the print outputs
	x=[] # avg genomic distances of bins
	y=[] # avg interaction probabilities of bins
	yerr=[] # stderrs of bins
	pairCounts=[] # number of pairs in bins
	interactionTotals=[] # number of interactions (reads) in bins

	# running statistics of the bin that is currently being filled
	noOfPairsForBin=0
	meanCountPerPair=0
	M2=0
	interactionTotalForBin=0
	interactionTotalForBinTermination=0
	distanceTotalForBin=0
	lastDistanceForBin=-1

	for lcount, eachrow in enumerate(sortedInteractions):
		interactionDistance=eachrow[0]
		interactionCount=eachrow[1]

		# a bin may only end between two different distances
		distanceChanged=(lastDistanceForBin!=-1 and lastDistanceForBin!=interactionDistance)
		if useBinning:
			binIsFull=distanceChanged and interactionTotalForBinTermination >= desiredPerBin
		else:
			binIsFull=distanceChanged

		if noOfPairsForBin>0 and binIsFull:
			avgDistance, meanProbabilityObsv, se_p = _summarize_bin(noOfPairsForBin, meanCountPerPair, M2, distanceTotalForBin)
			x.append(avgDistance)
			y.append(meanProbabilityObsv)
			yerr.append(se_p)
			pairCounts.append(noOfPairsForBin)
			interactionTotals.append(interactionTotalForBin)

			# now that we saved what we need
			# set the values back to defaults and go on to the next bin
			noOfPairsForBin=0
			meanCountPerPair=0
			M2=0
			interactionTotalForBin=0
			interactionTotalForBinTermination=0
			distanceTotalForBin=0
		# END if - that checks whether the bin is full etc.

		# this check is necessary for the second pass of fit-hic
		# we want to only use the non-outlier interactions in our
		# probability calculation
		if isOutlier[lcount]==0:
			distanceTotalForBin +=interactionDistance/distScaling
			interactionTotalForBin +=interactionCount
			noOfPairsForBin +=1
			# running mean / sum of squared deviations, updated one value at a time
			delta=interactionCount-meanCountPerPair
			meanCountPerPair += (delta*1.0) / noOfPairsForBin
			M2 +=delta*(interactionCount-meanCountPerPair)
		# END if
		# outliers still count towards the bin-termination total
		interactionTotalForBinTermination +=interactionCount
		lastDistanceForBin=interactionDistance
	# END for over sortedInteractions

	# The bin check above runs before a row is added, so the bin that is still
	# open after the last row has to be emitted here or it would be lost.
	if noOfPairsForBin>0:
		avgDistance, meanProbabilityObsv, se_p = _summarize_bin(noOfPairsForBin, meanCountPerPair, M2, distanceTotalForBin)
		x.append(avgDistance)
		y.append(meanProbabilityObsv)
		yerr.append(se_p)
		pairCounts.append(noOfPairsForBin)
		interactionTotals.append(interactionTotalForBin)

	if visual==True:
		sys.stderr.write("Plotting %s" % figname + ".png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(111)
		plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),'ro',label="Mean")
		plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='k.', label="Standard error")
		plt.ylabel('Contact probability (x10$^{-5}$)')
		plt.xlabel('Genomic distance (kb)')
		titleStr='Binning observed interactions using equal occupancy bins.\n No. of bins: '\
			+str(noOfBins) +', Library: ' + str(libname)+ ', No. of interactions: ' +str(observedIntraInRangeSum)
		plt.title(titleStr,size='small')
		ax.legend(loc="upper right")
		plt.savefig(outdir+'/'+figname+'.png')

	sys.stderr.write("Writing %s" % figname + ".txt\n")

	# opened only for the duration of the write so the handle cannot leak
	with open(outdir+'/'+figname+'.txt', 'w') as outfile:
		outfile.write("avgGenomicDist\tcontactProbability\tstandardError\tnoOfLocusPairs\ttotalOfContactCounts\n")
		for i in range(len(x)):
			outfile.write("%d" % x[i] + "\t"+"%.2e" % y[i]+ "\t" + "%.2e" % yerr[i] + "\t" +"%d" % pairCounts[i] + "\t" +"%d" % interactionTotals[i]+"\n")
	return [x,y,yerr] # from calculate_Probabilities

Example #10

Show file

File: fit-hi-c.py Project: jmrinaldi/HiC-pipeline

def fit_Spline(x,y,yerr,infilename,sortedInteractions,biasDic,figname,passNo):
	"""Fit a non-increasing spline to the binned probabilities and score the interactions.

	x, y, yerr -- per-bin distance, probability and error sequences.
	infilename -- gzip file with whitespace-separated columns
		chr1, mid1, chr2, mid2, count.
	sortedInteractions -- rows of (distance, count, bias12); their integer
		distances lying inside [min(x), max(x)] are the points at which the
		spline is evaluated, and every row is tested for being an outlier.
	biasDic -- mapping chromosome -> midpoint -> bias; loci that are missing
		keep a bias of 1.0.
	figname -- base name (inside the module-level outdir) of all output files.
	passNo -- number shown in the plot legends.

	A UnivariateSpline is fitted to (x, y) and made non-increasing with the R
	function 'monoreg' (type="antitonic"). One p-value per input line is then
	computed, corrected with myStats.benjamini_hochberg_correction and written
	to '<figname>.significances.txt.gz'.

	Returns [splineX, newSplineY, residual, isOutlier, FDRx, FDRy], where
	isOutlier holds one 0/1 flag per row of sortedInteractions.
	"""
	sys.stderr.write("\nFit a univariate spline to the probability means\n")
	sys.stderr.write("------------------------------------------------------------------------------------\n")
	sys.stderr.write("baseline intra-chr probability: " + repr(baselineIntraChrProb)+ "\tbaseline inter-chr probability: " + repr(baselineInterChrProb)+"\n")

	# assume residualFactor==-1:
	splineError=min(y)*min(y)

	# use fitpack2 method -fit on the real x and y from equal occupancy binning
	ius = UnivariateSpline(x, y, s=splineError)

	#### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
	### This is done by calling the R function monoreg with the option
	### antitonic, so that probabilities do not increase with genomic distance

	# computed once here; they are reused for every interaction further down
	tempMaxX=max(x)
	tempMinX=min(x)
	tempList=sorted(set(int(i[0]) for i in sortedInteractions))
	### Keep only distances inside [min(x) max(x)] so that everything
	### stays within the range where the spline is defined
	splineX=[d for d in tempList if tempMinX<=d<=tempMaxX]
	splineY=ius(splineX)

	# R vector format
	rSplineX=ro.FloatVector(splineX)
	rSplineY=ro.FloatVector(splineY)
	rMonoReg=ro.r['monoreg']
	# do the antitonic regression
	allRres=rMonoReg(rSplineX,rSplineY,type="antitonic")
	rNewSplineY=allRres[3]
	# convert data back to Python format
	newSplineY=[rNewSplineY[i] for i in range(len(rNewSplineY))]

	residual =sum([i*i for i in (y - ius(x))])

	if visual==True:
		### Now plot the results
		sys.stderr.write("Plotting %s" % figname + ".png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(2,1,1)
		plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
		plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)

		if useInters:
			plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineIntraChrProb for i in x],toProb),'k-',label="Baseline intra-chromosomal")
			# the inter-chromosomal curve must use the inter-chromosomal baseline
			plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineInterChrProb for i in x],toProb),'b-',label="Baseline inter-chromosomal")
		plt.ylabel('Contact probability (x10$^{-5}$)',fontsize='large')
		plt.xlabel('Genomic distance (kb)',fontsize='large')
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
		plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
		ax.legend(loc="upper right")

		ax = fig.add_subplot(2,1,2)

		plt.loglog(splineX,newSplineY,'g-')
		plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
		if useInters:
			plt.loglog(x,[baselineIntraChrProb for i in x],'k-')
			plt.loglog(x,[baselineInterChrProb for i in x],'b-')
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim([distLowThres, distUpThres])
		plt.ylabel('Contact probability (log-scale)',fontsize='large')
		plt.xlabel('Genomic distance (log-scale)',fontsize='large')

		plt.savefig(outdir+'/'+figname+'.png')

	# First pass over the input: one p-value per line
	intraInRangeCount=0
	intraOutOfRangeCount=0
	intraVeryProximalCount=0
	interCount=0
	sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " + repr(distUpThres) +"\n")
	p_vals=[]
	q_vals=[]
	with gzip.open(infilename, 'r') as infile:
		for line in infile:
			words=line.rstrip().split()
			chr1=words[0]
			chr2=words[2]
			midPoint1=int(words[1])
			midPoint2=int(words[3])
			interxn=myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
			interxn.setCount(int(words[4]))

			bias1=1.0; bias2=1.0;  # assumes there is no bias to begin with
			# if the biasDic is not null sets the real bias values
			if biasDic:
				if chr1 in biasDic and midPoint1 in biasDic[chr1]:
					bias1=biasDic[chr1][midPoint1]
				if chr2 in biasDic and midPoint2 in biasDic[chr2]:
					bias2=biasDic[chr2][midPoint2]

			if (bias1<0 or bias2<0) and interxn.type!='inter':
				p_val=1.0
			else:
				interactionType=interxn.getType(distLowThres,distUpThres)
				if interactionType=='intraInRange':
					# make sure the interaction distance is covered by the probability bins
					distToLookUp=min(max(interxn.distance,tempMinX),tempMaxX)
					i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
					prior_p=newSplineY[i]*(bias1*bias2) # biases added in the picture
					intraInRangeCount +=1
					############# THIS HAS TO BE interactionCount-1 ##################
					p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraInRangeSum,prior_p)
				elif interactionType=='intraShort':
					p_val=1.0
					intraVeryProximalCount +=1
				elif interactionType=='intraLong':
					# out of range bigger than distUpThres: the prior is fixed to 1.0
					p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraAllSum,1.0)
					intraOutOfRangeCount +=1
				elif useInters:
					prior_p=baselineInterChrProb*(bias1*bias2) # biases added in the picture
					############# THIS HAS TO BE interactionCount-1 ##################
					p_val=scsp.bdtrc(interxn.hitCount-1,observedInterAllSum,prior_p)
					interCount +=1
				else:
					# inter-chromosomal line while inters are not scored
					p_val=1.0
			# Exactly one entry per input line: the writing loop below indexes
			# p_vals/q_vals by line number, so skipping a line would shift
			# every later value onto the wrong interaction.
			p_vals.append(p_val)
		# END for

	# Do the BH FDR correction
	if useInters:
		q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraAllCount)
		sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " + repr(possibleInterAllCount+possibleIntraAllCount)+"\n")
	else:
		q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
		sys.stderr.write("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount)+"\n")

	# Second pass: write p-values and q-values next to the original columns
	sys.stderr.write("Writing p-values to file %s" % figname + ".significances.txt.gz\n")
	with gzip.open(infilename, 'r') as infile:
		with gzip.open(outdir+'/'+figname+'.significances.txt.gz', 'w') as outfile:
			outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
			for count, line in enumerate(infile):
				words=line.rstrip().split()
				chrNo1=words[0]
				midPoint1=int(words[1])
				chrNo2=words[2]
				midPoint2=int(words[3])
				interactionCount=int(words[4])
				p_val=p_vals[count]
				q_val=q_vals[count]

				if useInters==False and chrNo1==chrNo2: # intra
					interactionDistance=abs(midPoint1-midPoint2) # dist
					if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
						outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
				elif useInters==True and chrNo1!=chrNo2:
					outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
			# END for - printing pvals and qvals for all the interactions

	isOutlier=[]
	distsBelow=[]
	distsAbove=[]
	intcountsBelow=[]
	intcountsAbove=[]
	outlierThres=1.0/possibleIntraInRangeCount
	for interactionDistance,interactionCount,bias12 in sortedInteractions:
		# make sure the interaction distance is covered by the probability bins
		distToLookUp=min(max(interactionDistance,tempMinX),tempMaxX)
		i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
		prior_p=newSplineY[i]*float(bias12) # biases added in the picture
		############# THIS HAS TO BE interactionCount-1 ##################
		p_val=scsp.bdtrc(interactionCount-1,observedIntraInRangeSum,prior_p)
		if p_val < outlierThres:
			distsBelow.append(interactionDistance)
			intcountsBelow.append(interactionCount)
			isOutlier.append(1)
		else:
			distsAbove.append(interactionDistance)
			intcountsAbove.append(interactionCount)
			isOutlier.append(0)
	# END for - doing the outlier check for all interactions in sortedInteractions

	if visual==True:
		sys.stderr.write("Plotting results of extracting outliers to file %s" % figname + ".extractOutliers.png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(111)
		# sample() needs an integer size, hence the floor division
		downsample=30 # for the non-outliers (sampled, but not plotted below)
		randIndcsAbove=sorted(sample(range(len(intcountsAbove)),len(intcountsAbove)//downsample))
		downsample=20 # for the outliers
		randIndcsBelow=sorted(sample(range(len(intcountsBelow)),len(intcountsBelow)//downsample))

		plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],toKb),[intcountsBelow[i] for i in randIndcsBelow], 'r.',label="Outliers (p-value < 1/M)")
		# the spline is extended flat up to maxObservedGenomicDist
		expectedCounts=[p*observedIntraInRangeSum for p in newSplineY]
		plt.plot(myUtils.scale_a_list(splineX+[maxObservedGenomicDist],toKb),expectedCounts+[newSplineY[-1]*observedIntraInRangeSum], 'g-', label="spline-"+str(passNo)+" (x N)", linewidth=2.5)

		plt.xlabel('Genomic distance (kb)')
		plt.ylabel('Contact counts')
		print(repr(len(intcountsBelow))+"\t"),
		## this limits y-axis of the hit count plots
		if len(intcountsBelow)>0:
			plt.ylim([0,min(max(intcountsBelow),1500)])
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim([0, distUpThres*toKb])
		ax.legend(loc="upper right",fancybox=True)
		plt.savefig(outdir+'/'+figname+'.extractOutliers.png')

	sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount)+"\tintraOutOfRangeCount " +\
		repr(intraOutOfRangeCount)+"\tintraVeryProximalCount " + repr(intraVeryProximalCount) +"\tinterCount " + repr(interCount)+"\n")

	if visual==True:
		sys.stderr.write("Plotting q-values to file %s" % figname + ".qplot.png\n")
	minFDR=0.0
	maxFDR=0.05
	increment=0.001
	FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,figname+".qplot")

	return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy] # from fit_Spline