Beispiel #1
0
def read_Interactions(contactCountsFile, biasFile, outliers=None):
    """Read the gzipped contact-counts file and tally the observed counts.

    Every line must hold five whitespace-separated fields:
    ``chr1 mid1 chr2 mid2 contactCount``.

    Args:
        contactCountsFile: Path of the gzip-compressed contact counts file.
        biasFile: Not read by this function; kept so the signature stays
            compatible with existing callers.
        outliers: Optional indexable collection of zero-based line numbers
            to skip.  It is walked with a single forward pointer, so an
            entry is only honoured if all entries before it were matched
            first (i.e. the collection is expected in ascending order).

    Returns:
        A tuple ``(mainDic, observedInterAllSum, observedIntraAllSum,
        observedIntraInRangeSum)``.  ``mainDic`` maps each genomic distance
        seen among 'intraInRange' interactions to a two-item list whose
        second item is the summed contact count at that distance (the
        first item is left at 0 here).

    Side effects:
        Prints progress to stdout and writes a summary to the module-level
        ``logfile``.
    """
    mainDic = {}
    print("Reading the contact counts file to generate bins...")
    startT = time.time()

    observedInterAllSum = 0
    observedIntraAllSum = 0
    observedInterAllCount = 0
    observedIntraAllCount = 0
    observedIntraInRangeSum = 0
    observedIntraInRangeCount = 0
    minObservedGenomicDist = float('inf')
    maxObservedGenomicDist = 0

    # ``is not None`` instead of ``!= None`` so that containers overriding
    # comparison operators cannot confuse the check.
    outlierTotal = len(outliers) if outliers is not None else 0
    outlierposctr = 0
    with gzip.open(contactCountsFile, 'rt') as f:
        for linectr, line in enumerate(f):
            # Skip this line if it is the next pending outlier.
            if (outlierposctr < outlierTotal
                    and linectr == outliers[outlierposctr]):
                outlierposctr += 1
                continue

            ch1, mid1, ch2, mid2, contactCount = line.split()
            interxn = myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
            interxn.setCount(float(contactCount))
            interactionType = interxn.getType(distLowThres, distUpThres)
            count = interxn.getCount()

            if interactionType == 'inter':
                observedInterAllSum += count
                observedInterAllCount += 1
                continue

            # Any other type is treated as intra-chromosomal.
            observedIntraAllSum += count
            observedIntraAllCount += 1
            if interactionType == 'intraInRange':
                dist = interxn.getDistance()
                minObservedGenomicDist = min(minObservedGenomicDist, dist)
                maxObservedGenomicDist = max(maxObservedGenomicDist, dist)
                if dist not in mainDic:
                    mainDic[dist] = [0, 0]
                mainDic[dist][1] += count
                observedIntraInRangeSum += count
                observedIntraInRangeCount += 1
    endT = time.time()
    print("Interactions file read. Time took %s" % (endT - startT))

    # NOTE(review): mode 'w' truncates an existing log file; confirm this is
    # intended if the function is called more than once per run.
    with open(logfile, 'w') as log:
        log.write("\n\nInteractions file read successfully\n")
        log.write("------------------------------------------------------------------------------------\n")
        log.write("Observed, Intra-chr in range: pairs= %s\t totalCount= %s\n"
                  % (observedIntraInRangeCount, observedIntraInRangeSum))
        log.write("Observed, Intra-chr all: pairs= %s\t totalCount= %s\n"
                  % (observedIntraAllCount, observedIntraAllSum))
        log.write("Observed, Inter-chr all: pairs= %s\t totalCount= %s\n"
                  % (observedInterAllCount, observedInterAllSum))
        log.write("Range of observed genomic distances [%s %s]"
                  % (minObservedGenomicDist, maxObservedGenomicDist) + "\n")
        log.write("\n")
    return (mainDic, observedInterAllSum, observedIntraAllSum,
            observedIntraInRangeSum)  # from read_Interactions
Beispiel #2
0
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit the distance/probability spline and write p- and q-values.

    Unless the module-level ``interOnly`` flag is set, a univariate spline
    is fitted through ``(x, y)``, evaluated at every distance key of
    ``mainDic`` that lies within ``[min(x), max(x)]`` and made
    non-increasing with an isotonic regression.  The input file is then
    read twice: once to compute a p-value per line and once to write the
    significance table.

    Args:
        mainDic: Mapping whose keys are genomic distances.
        x: Bin distances.  Sorted IN PLACE when ``outliersdist`` is not
            None; must end up strictly increasing.
        y: Bin contact probabilities, parallel to ``x``.
        yerr: Error bars for ``y`` (plotting only).
        infilename: Gzipped interactions file
            (``chr1 mid1 chr2 mid2 contactCount`` per line).
        outfilename: Prefix for the output files.
        biasDic: Optional ``chr -> {midpoint -> bias}`` lookup; a missing
            entry means a bias of 1.0.
        outliersline: Collection with ``add``; receives the zero-based line
            index of every outlier.
        outliersdist: Collection with ``add``; receives
            ``abs(mid1 - mid2)`` of every outlier.
        observedIntraInRangeSum: Trial count for in-range intra p-values.
        possibleIntraInRangeCount: Possible in-range intra pairs.
        possibleInterAllCount: Possible inter pairs.
        observedIntraAllSum: Not used by this function.
        observedInterAllSum: Trial count for inter p-values.
        resolution: If truthy, embedded in the output file name.
        passNo: Pass number, used in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist,
        FDRx, FDRy]``.  The first three are None when ``interOnly`` is set.

    Side effects:
        Appends to ``logfile``, prints to stdout, writes the gzipped
        significance table, optionally saves a PNG, and calls
        ``sys.exit(2)`` if ``x`` is not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    splineX = None
    newSplineY = None
    residual = None
    minX = None
    maxX = None

    if not interOnly:
        if outliersdist is not None:
            # Reorder y to follow x, then sort x itself (in place).
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                # The spline needs strictly increasing x values.
                print(
                    "ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct."
                )
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        minY = min(y)
        splineError = minY * minY

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)

        # Only keep distances inside [min(x), max(x)], i.e. where the
        # spline is defined.
        minX = min(x)
        maxX = max(x)
        splineX = [dist for dist in sorted(mainDic) if minX <= dist <= maxX]
        splineY = ius(splineX)

        # Force the fitted probabilities to be non-increasing with distance.
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum(i * i for i in (y - ius(x)))

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-',
                     label="spline-" + str(passNo),
                     linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.',
                         label="Mean with std. error",
                         linewidth=2)

            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(
                    myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)

            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename + '.png')

    # First pass over the input: one p-value (and bias pair) per line.
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(float(contactCount))
            interactionType = interxn.getType(distLowThres, distUpThres)

            # No bias unless biasDic provides one for the locus.
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # Negative bias on an intra pair: discard (p-value 1).
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # Clamp the distance into the range covered by the spline.
                distToLookUp = min(max(interxn.getDistance(), minX), maxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif (interactionType in ('intraShort', 'intraLong')
                  and not interOnly):
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        totalTests = possibleIntraInRangeCount + possibleInterAllCount
    elif interOnly:
        totalTests = possibleInterAllCount
    else:
        totalTests = possibleIntraInRangeCount
    outlierThres = 1.0 / totalTests
    q_vals = myStats.benjamini_hochberg_correction(p_vals, totalTests)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass over the input: write the significance table.
    if resolution:
        sigPath = (outfilename + '.res' + str(resolution) +
                   '.significances.txt.gz')
    else:
        sigPath = outfilename + '.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sigPath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" % sigPath)
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if chr1 != chr2:
                writeRow = allReg or interOnly
            else:
                writeRow = (allReg or not interOnly) and \
                    myUtils.in_range_check(abs(midPoint1 - midPoint2),
                                           distLowThres, distUpThres)
            if writeRow:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" %
                              (str(chr1), midPoint1, str(chr2), midPoint2,
                               interactionCount, p_val, q_val, biasl[count],
                               biasr[count]))

            # Outliers are recorded whether or not the row was written.
            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1 - midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [
        splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy
    ]  # from fit_Spline
Beispiel #3
0
def read_All_Interactions(infilename, biasDic):
    """Read every interaction and return the in-range intra pairs by distance.

    Each line of the gzipped input must hold
    ``chr1 mid1 chr2 mid2 contactCount``.  Pairs with a fragment that is not
    listed in the module-level ``listOfMappableFrags`` are reported on
    stderr and ignored.  For every 'intraInRange' pair the matching entry of
    the module-level ``possiblePairsPerDistance`` is replaced by
    ``[distance, hitCount, bias1 * bias2]``.

    Args:
        infilename: Path of the gzip-compressed interactions file.
        biasDic: ``chr -> {midpoint -> bias}`` lookup; may be empty, and a
            missing entry means a bias of 1.0.

    Returns:
        All values of ``possiblePairsPerDistance`` as a list, sorted in
        place on column 0 by ``myUtils.sort_by_column``.

    Side effects:
        Updates the observed* / min/maxObservedGenomicDist module globals,
        writes progress to stderr and exits via ``sys.exit`` when an
        in-range pair is missing from ``possiblePairsPerDistance``.
    """
    sys.stderr.write(
        "\nReading all the interactions and then sorting the intra chr ones in range according to genomic distance\n"
    )
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )

    # global variables updated by this function
    global observedIntraAllSum
    global observedIntraAllCount
    global observedIntraInRangeSum
    global observedIntraInRangeCount
    global observedInterAllSum
    global observedInterAllCount
    global minObservedGenomicDist
    global maxObservedGenomicDist

    # Text mode ('rt') so that fields are str rather than bytes on Python 3;
    # the chromosome names are compared against str keys and printed below.
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(int(words[4]))
            chrIndex1 = chrList.index(interxn.chr1)
            chrIndex2 = chrList.index(interxn.chr2)

            # No bias unless biasDic provides one for the locus.
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (interxn.mid1 not in listOfMappableFrags[chrIndex1]
                    or interxn.mid2 not in listOfMappableFrags[chrIndex2]):
                sys.stderr.write("Not-mappable fragment pair: %s\t" %
                                 str(interxn.chr1) + "%d\t" % interxn.mid1 +
                                 "%s\t" % str(interxn.chr2) +
                                 "%d\n" % interxn.mid2)
                continue

            if interxn.type == 'inter':
                observedInterAllSum += interxn.hitCount
                observedInterAllCount += 1
                continue

            # any type of intra
            observedIntraAllSum += interxn.hitCount
            observedIntraAllCount += 1
            if interxn.getType(distLowThres, distUpThres) == 'intraInRange':
                minObservedGenomicDist = min(minObservedGenomicDist,
                                             interxn.distance)
                maxObservedGenomicDist = max(maxObservedGenomicDist,
                                             interxn.distance)
                # every pair should already be in the dictionary with a zero interaction count
                dictkey = (str(interxn.chr1) + '-' +
                           str(min(interxn.mid1, interxn.mid2)) + '-' +
                           str(max(interxn.mid1, interxn.mid2)))
                if dictkey not in possiblePairsPerDistance:
                    sys.exit("Illegal fragment pair")
                possiblePairsPerDistance[dictkey] = [
                    interxn.distance, interxn.hitCount, bias1 * bias2
                ]  #--now with biases
                observedIntraInRangeSum += interxn.hitCount
                observedIntraInRangeCount += 1

    sys.stderr.write("Total of \t" + str(observedIntraAllCount) +
                     " observed intra-chr fragment pairs,\t" +
                     str(observedIntraInRangeCount) +
                     " observed intra-chr fragment pairs in range,\t" +
                     str(observedInterAllCount) +
                     " observed inter-chr fragment pairs\n")
    sys.stderr.write("Total of \t" + str(observedIntraAllSum) +
                     " observed intra-chr read counts,\t" +
                     str(observedIntraInRangeSum) +
                     " observed intra-chr read counts in range,\t" +
                     str(observedInterAllSum) +
                     " observed inter-chr read counts\n")
    sys.stderr.write("Range of observed genomic distances\t[%d\t%d]" %
                     (minObservedGenomicDist, maxObservedGenomicDist) + "\n")

    # Collect every [distance, count, bias] entry, then sort by distance.
    sortedInteractions = list(possiblePairsPerDistance.values())

    t = time.time()
    myUtils.sort_by_column(
        sortedInteractions,
        0)  #in-place sorting according to column index 0 (first column)
    sys.stderr.write(
        "Total time for sorting interactions according to genomic distance: %.3f\n"
        % (time.time() - t))

    return sortedInteractions  #from read_All_Interactions
Beispiel #4
0
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname,
               passNo):
    """Fit the distance/probability spline, write significances, flag outliers.

    A univariate spline is fitted through ``(x, y)``, evaluated at every
    distinct integer distance of ``sortedInteractions`` that lies within
    ``[min(x), max(x)]`` and made non-increasing through the R function
    ``monoreg`` (antitonic regression).  The input file is then read twice:
    once to compute one p-value per line and once to write the gzipped
    significance table.  Finally every entry of ``sortedInteractions`` is
    tested against the outlier threshold ``1 / possibleIntraInRangeCount``.

    Args:
        x: Bin distances.
        y: Bin contact probabilities, parallel to ``x``.
        yerr: Error bars for ``y`` (plotting only).
        infilename: Gzipped interactions file
            (``chr1 mid1 chr2 mid2 contactCount`` per line).
        sortedInteractions: Iterable of ``[distance, count, bias]`` triples.
        biasDic: ``chr -> {midpoint -> bias}`` lookup; may be empty, and a
            missing entry means a bias of 1.0.
        figname: Base name of the files written below the module-level
            ``outdir``.
        passNo: Pass number, used in the plot legends.

    Returns:
        ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]`` where
        ``isOutlier`` holds one 0/1 flag per entry of
        ``sortedInteractions``.

    Side effects:
        Writes progress to stderr, writes
        ``<outdir>/<figname>.significances.txt.gz`` and, when the
        module-level ``visual`` flag is set, saves PNG plots.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )
    sys.stderr.write("baseline intra-chr probability: " +
                     repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " +
                     repr(baselineInterChrProb) + "\n")

    # maximum residual allowed for the spline is min(y)^2
    minY = min(y)
    splineError = minY * minY

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # Only keep distances inside [min(x), max(x)], i.e. where the spline is
    # defined.
    minX = min(x)
    maxX = max(x)
    distinctDists = sorted({int(row[0]) for row in sortedInteractions})
    splineX = [dist for dist in distinctDists if minX <= dist <= maxX]
    splineY = ius(splineX)

    # Post-process the spline so that it is non-increasing: antitonic
    # regression through the R function monoreg.
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    # convert data back to Python format
    newSplineY = list(allRres[3])

    residual = sum(i * i for i in (y - ius(x)))

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb),
                 myUtils.scale_a_list(newSplineY, toProb),
                 'g-',
                 label="spline-" + str(passNo),
                 linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb),
                     fmt='r.',
                     label="Mean with std. error",
                     linewidth=2)

        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for _ in x],
                                          toProb),
                     'k-',
                     label="Baseline intra-chromosomal")
            # The inter-chromosomal baseline must use the inter probability
            # (it previously re-plotted the intra baseline).
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineInterChrProb for _ in x],
                                          toProb),
                     'b-',
                     label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)

        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for _ in x], 'k-')
            plt.loglog(x, [baselineInterChrProb for _ in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')

        plt.savefig(outdir + '/' + figname + '.png')

    # First pass over the input: one p-value per line.
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " +
                     repr(distUpThres) + "\n")
    p_vals = []
    # Text mode so that fields are str (not bytes) on Python 3.
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(int(words[4]))

            # No bias unless biasDic provides one for the locus.
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
                p_val = 1.0
            else:
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(interxn.distance, minX), maxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    intraInRangeCount += 1
                    # The first argument has to be interactionCount - 1.
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                    intraVeryProximalCount += 1
                elif interactionType == 'intraLong':
                    # out of range, bigger than distUpThres: prior fixed at 1.0
                    prior_p = 1.0
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraAllSum, prior_p)
                    intraOutOfRangeCount += 1
                elif useInters:
                    prior_p = baselineInterChrProb * (bias1 * bias2)
                    # The first argument has to be interactionCount - 1.
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedInterAllSum, prior_p)
                    interCount += 1
                else:
                    # Inter pair while inters are not modelled.  A p-value is
                    # still recorded so that p_vals stays aligned with the
                    # input lines for the write loop below.
                    p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) +
                         "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " +
                         repr(possibleIntraInRangeCount) + "\n")

    # Second pass over the input: write the significance table.
    sigPath = outdir + '/' + figname + '.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sigPath, 'wt') as outfile:
        sys.stderr.write("Writing p-values to file %s" % figname +
                         ".significances.txt.gz\n")
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if useInters == False and chrNo1 == chrNo2:  # intra
                interactionDistance = abs(midPoint1 - midPoint2)  # dist
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                                  (str(chrNo1), midPoint1, str(chrNo2),
                                   midPoint2, interactionCount, p_val, q_val))
            elif useInters == True and chrNo1 != chrNo2:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))

    # Outlier check for all interactions in sortedInteractions.
    isOutlier = []
    distsBelow = []
    distsAbove = []
    intcountsBelow = []
    intcountsAbove = []
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp = min(max(interactionDistance, minX), maxX)
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)
        # The first argument has to be interactionCount - 1.
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum,
                           prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            distsAbove.append(interactionDistance)
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" %
                         figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # Integer division: sample() needs an integer sample size.
        downsample = 30  # for the non-outliers
        randIndcsAbove = sorted(
            sample(range(len(intcountsAbove)),
                   len(intcountsAbove) // downsample))
        downsample = 20  # for the outliers
        randIndcsBelow = sorted(
            sample(range(len(intcountsBelow)),
                   len(intcountsBelow) // downsample))

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],
                                      toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.',
                 label="Outliers (p-value < 1/M)")
        # Extend the spline curve flat out to the largest observed distance.
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist],
                                      toKb),
                 [prob * observedIntraInRangeSum for prob in newSplineY] +
                 [newSplineY[-1] * observedIntraInRangeSum],
                 'g-',
                 label="spline-" + str(passNo) + " (x N)",
                 linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        print(repr(len(intcountsBelow)) + "\t")
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount) +
                     "\tintraOutOfRangeCount " + repr(intraOutOfRangeCount) +
                     "\tintraVeryProximalCount " +
                     repr(intraVeryProximalCount) + "\tinterCount " +
                     repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname +
                         ".qplot.png\n")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              figname + ".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx,
            FDRy]  # from fit_Spline
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic):
    """Fit a monotone spline to the binned probabilities and score contacts.

    The function works in four steps:

    1. Fit a ``UnivariateSpline`` to ``(x, y)`` with smoothing factor
       ``min(y) ** 2``.
    2. Evaluate the spline on every distance key of ``mainDic`` that lies in
       ``[min(x), max(x)]`` and pass the result through R's ``monoreg`` with
       ``type="antitonic"``.
    3. Plot the fit to ``<outfilename>.res<resolution>.png``.
    4. Read ``infilename`` twice: once to compute a p-value per line, and
       once more to write every line together with its p-value and q-value
       to ``<outfilename>.res<resolution>.significances.txt.gz``.

    Parameters
    ----------
    mainDic : mapping
        Only its keys (genomic distances) are used here.
    x, y : sequences of numbers
        Points the spline is fitted to.  ``y - ius(x)`` is evaluated, so
        ``y`` presumably is a NumPy array or behaves like one -- TODO confirm
        against the caller.
    yerr :
        Not used by this function; kept for interface compatibility.
    infilename : str
        Gzip-compressed text file with whitespace-separated columns
        ``chr1 mid1 chr2 mid2 contactCount``.
    outfilename : str
        Prefix for the figure and the significance table.
    biasDic : mapping
        ``biasDic[chr][midpoint]`` -> bias.  When empty, all biases are 1.0.
        A bias of ``-1`` on either locus gives the pair a p-value of 1.0.

    Returns
    -------
    list
        ``[splineX, newSplineY, residual]``.

    Notes
    -----
    Reads the module-level names ``resolution``, ``distLowThres``,
    ``distUpThres``, ``observedIntraInRangeSum``, ``observedIntraAllSum``,
    ``observedInterAllSum``, ``baselineIntraChrProb``, ``interChrProb``,
    ``possibleInterAllCount`` and ``possibleIntraAllCount``.

    The gzip files are opened in text mode so that chromosome names are
    ``str`` (and can match ``biasDic`` keys) and the output accepts ``str``
    on Python 3.
    """
    print("\nFit a univariate spline to the probability means\n")
    print(
        "------------------------------------------------------------------------------------\n"
    )

    # maximum residual allowed for spline is set to min(y)^2
    minY = min(y)
    splineError = minY * minY

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### This is done by calling the R function monoreg with the antitonic
    ### option, so that probabilities do not increase with genomic distance.

    tempMaxX = max(x)
    tempMinX = min(x)
    ### Keep only the distances inside [min(x), max(x)], i.e. the range in
    ### which the spline is defined.
    splineX = [dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX]
    splineY = ius(splineX)

    # R vector format
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    # do the antitonic regression
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    ### Now newSplineY holds the monotonic contact probabilities
    residual = sum([r * r for r in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([tempMinX / 1000.0, tempMaxX / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([tempMinX, tempMaxX])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    # NOW compute the p-value of every interaction in the input file
    print("lower bound on mid-range distances  " + repr(distLowThres) +
          ", upper bound on mid-range distances  " + repr(distUpThres) + "\n")
    lastSplineIdx = len(splineX) - 1
    p_vals = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(float(words[4]))

            # assumes there is no bias to begin with
            # if the biasDic is not empty sets the real bias values
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if bias1 == -1 or bias2 == -1:
                # a bias of -1 marks a locus to be discarded
                p_val = 1.0
            elif interxn.type == 'intra':
                interactionType = interxn.getType(distLowThres, distUpThres)
                # NOTE(review): if getType ever returns a value other than
                # the three handled below, p_val keeps the value from the
                # previous line (same as before this rewrite) -- confirm that
                # this cannot happen.
                if interactionType == 'intraInRange':
                    # clamp the distance into the range covered by the
                    # probability bins before looking up the spline value
                    distToLookUp = min(max(interxn.distance, tempMinX),
                                       tempMaxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            lastSplineIdx)
                    # biases added in the picture
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                elif interactionType == 'intraLong':
                    ## out of range distance
                    ## use the prior of the baseline intra-chr interaction probability
                    prior_p = baselineIntraChrProb * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraAllSum, prior_p)
            else:  # inter
                prior_p = interChrProb * (bias1 * bias2)
                ############# THIS HAS TO BE interactionCount-1 ##################
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum,
                                   prior_p)
            p_vals.append(p_val)

    # Do the BH FDR correction
    q_vals = myStats.benjamini_hochberg_correction(
        p_vals, possibleInterAllCount + possibleIntraAllCount)

    # Second pass over the input: write each line with its p- and q-value
    sigFileName = (outfilename + '.res' + str(resolution) +
                   '.significances.txt.gz')
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sigFileName, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" % outfilename +
              ".significances.txt\n")
        print("Number of pairs discarded due to bias not in range [0.5 2]\n")
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )
        for idx, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            # parse via float, like the first pass, so that counts written
            # as e.g. "3.0" do not raise ValueError
            interactionCount = int(float(words[4]))
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                          (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                           interactionCount, p_vals[idx], q_vals[idx]))
    return [splineX, newSplineY, residual]  # from fit_Spline
def read_All_Interactions(mainDic, contactCountsFile, noOfFrags):
    """Accumulate observed contact counts from a gzip-compressed file.

    Each line of ``contactCountsFile`` must hold five whitespace-separated
    fields: ``chr1 mid1 chr2 mid2 contactCount``.  For every line the
    module-level counters are updated in place:

    * inter-chromosomal pairs add to ``observedInterAllSum`` /
      ``observedInterAllCount``;
    * all other pairs add to ``observedIntraAllSum`` /
      ``observedIntraAllCount``;
    * pairs whose ``getType(distLowThres, distUpThres)`` is
      ``'intraInRange'`` additionally update ``observedIntraInRangeSum`` /
      ``observedIntraInRangeCount``, widen ``minObservedGenomicDist`` /
      ``maxObservedGenomicDist``, and add their count to
      ``mainDic[distance][1]`` when that distance is already a key of
      ``mainDic`` (missing distances are not inserted).

    Parameters
    ----------
    mainDic : dict
        Maps genomic distance to a mutable sequence whose element ``[1]``
        accumulates contact counts.  Modified in place.
    contactCountsFile : str
        Path to the gzip-compressed contact counts file.
    noOfFrags :
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    dict
        The same ``mainDic`` object that was passed in.
    """
    global observedInterAllSum
    global observedInterAllCount
    global observedIntraAllSum
    global observedIntraAllCount
    global observedIntraInRangeSum
    global observedIntraInRangeCount
    global minObservedGenomicDist
    global maxObservedGenomicDist

    print("\nReading all the contact counts\n")
    print(
        "------------------------------------------------------------------------------------\n"
    )

    count = 0
    # Text mode keeps chromosome names as str on Python 3; the context
    # manager closes the file even if a malformed line raises.
    with gzip.open(contactCountsFile, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.split()
            ### FIXME: this part will need to be fixed for human etc
            contactCount = float(contactCount)
            interxn = myUtils.Interaction([ch1, int(mid1), ch2, int(mid2)])
            interxn.setCount(contactCount)
            count += 1

            # progress indicator for large files
            if count % 1000000 == 0:
                print(count)

            if interxn.type == 'inter':
                observedInterAllSum += interxn.hitCount
                observedInterAllCount += 1
                continue

            # any type of intra
            observedIntraAllSum += interxn.hitCount
            observedIntraAllCount += 1
            if interxn.getType(distLowThres, distUpThres) == 'intraInRange':
                minObservedGenomicDist = min(minObservedGenomicDist,
                                             interxn.distance)
                maxObservedGenomicDist = max(maxObservedGenomicDist,
                                             interxn.distance)
                if interxn.distance in mainDic:
                    mainDic[interxn.distance][1] += contactCount
                observedIntraInRangeSum += interxn.hitCount
                observedIntraInRangeCount += 1

    print("Observed, Intra-chr in range: pairs= " +
          str(observedIntraInRangeCount) + "\t totalCount= " +
          str(observedIntraInRangeSum))
    print("Observed, Intra-chr all: pairs= " + str(observedIntraAllCount) +
          "\t totalCount= " + str(observedIntraAllSum))
    print("Observed, Inter-chr all: pairs= " + str(observedInterAllCount) +
          "\t totalCount= " + str(observedInterAllSum))
    print(
        "Range of observed genomic distances [%d %d]" %
        (minObservedGenomicDist, maxObservedGenomicDist) + "\n")

    return mainDic  # from read_All_Interactions