Example #1
0
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit a monotone spline to the binned contact probabilities and write
    per-interaction p-values and q-values.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    is fitted to ``(x, y)`` with smoothing ``min(y)**2``, evaluated at every
    key of ``mainDic`` lying in ``[min(x), max(x)]`` and made non-increasing
    with ``IsotonicRegression(increasing=False)``.  The interactions file is
    then read twice: once to compute a binomial-tail p-value per line
    (``scsp.bdtrc``) and once to write the results next to the
    Benjamini-Hochberg corrected q-values.

    Reads the module-level names ``logfile``, ``interOnly``, ``allReg``,
    ``visual``, ``interChrProb``, ``distLowThres``, ``distUpThres``, ``toKb``
    and ``toProb``.

    Args:
        mainDic: container whose keys are the distances at which the spline
            is evaluated (only its keys are used here).
        x: bin distances; sorted IN PLACE when ``outliersdist`` is not None
            and required to be strictly increasing afterwards.
        y: values to fit, paired with ``x``.
        yerr: error bars for ``y``; used only for plotting.
        infilename: gzip text file with five whitespace-separated columns per
            line: chr1, mid1, chr2, mid2, contactCount.
        outfilename: prefix for the ``.png`` plot, the
            ``[.res<resolution>].significances.txt.gz`` table and the
            ``.qplot`` output passed to ``plot_qvalues``.
        biasDic: optional mapping ``chr -> {midpoint: bias}``; missing
            entries default to a bias of 1.0.
        outliersline: collection with ``.add``; receives the 0-based line
            index of every interaction whose p-value is below the threshold.
        outliersdist: collection with ``.add``; receives the matching
            ``abs(mid1 - mid2)`` values.
        observedIntraInRangeSum: trial count passed to ``bdtrc`` for
            in-range intra-chromosomal interactions.
        possibleIntraInRangeCount: used for the BH correction and the outlier
            threshold.
        possibleInterAllCount: used for the BH correction and the outlier
            threshold.
        observedIntraAllSum: not used by this function.
        observedInterAllSum: trial count passed to ``bdtrc`` for the
            inter-chromosomal branch.
        resolution: when truthy, added to the significance file name.
        passNo: pass number, used only in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``; the first three are ``None`` when ``interOnly`` is set.

    Exits the process with status 2 when ``x`` is not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    splineX = None
    newSplineY = None
    residual = None
    # Range of the fitted distances; only set (and only needed) when a spline
    # is actually fitted.
    minX = None
    maxX = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                # The spline needs strictly increasing abscissae.
                print(
                    "ERROR in spline fitting. Distances do not strictly increase across bins. Ensure interaction file is correct."
                )
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        minX = min(x)
        maxX = max(x)
        # Keep only distances inside [min(x), max(x)], i.e. where the spline
        # is defined.
        splineX = [dis for dis in sorted(mainDic) if minX <= dis <= maxX]
        splineY = ius(splineX)

        # Force the fitted curve to be non-increasing with distance.
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([i * i for i in (y - ius(x))])

        if visual:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-',
                     label="spline-" + str(passNo),
                     linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.',
                         label="Mean with std. error",
                         linewidth=2)

            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(
                    myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)

            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename + '.png')
            # Release the figure; otherwise one figure per pass stays
            # registered with pyplot.
            plt.close(fig)

    # First pass over the interactions: compute one p-value per input line.
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres, distUpThres)

            # assumes there is no bias to begin with
            # if the biasDic is not null sets the real bias values
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # Negative bias on an intra-chromosomal pair: discarded.
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # Clamp the distance into the fitted range, then take the
                # first spline point at or beyond it.
                distToLookUp = min(max(interxn.getDistance(), minX), maxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif (interactionType in ('intraShort', 'intraLong')
                  and not interOnly):
                # Out-of-range intra-chromosomal pairs are never significant.
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        totalTests = possibleInterAllCount + possibleIntraInRangeCount
    elif interOnly:
        totalTests = possibleInterAllCount
    else:
        totalTests = possibleIntraInRangeCount
    outlierThres = 1.0 / totalTests
    q_vals = myStats.benjamini_hochberg_correction(p_vals, totalTests)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: write the values back out next to the original columns.
    if resolution:
        sigPath = (outfilename + '.res' + str(resolution) +
                   '.significances.txt.gz')
    else:
        sigPath = outfilename + '.significances.txt.gz'
    print("Writing p-values and q-values to file %s" % sigPath)
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sigPath, 'wt') as outfile:
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]
            interactionDistance = abs(midPoint1 - midPoint2)

            if chr1 != chr2:
                shouldWrite = allReg or interOnly
            else:
                shouldWrite = (allReg or not interOnly) and \
                    myUtils.in_range_check(interactionDistance, distLowThres,
                                           distUpThres)
            if shouldWrite:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" %
                              (str(chr1), midPoint1, str(chr2), midPoint2,
                               interactionCount, p_val, q_val, biasl[count],
                               biasr[count]))

            # Outliers are recorded for every line, written or not.
            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(interactionDistance)

    if visual:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [
        splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy
    ]  # from fit_Spline
Example #2
0
def generate_FragPairs(infilename):
    """Enumerate possible fragment pairs from a gzipped fragments file.

    Each line of ``infilename`` is split on whitespace; column 0 is the
    chromosome name, column 2 the fragment midpoint and column 3 the hit
    count (column 1 is ignored).  A fragment is kept as mappable when its
    hit count is at least the module-level ``mappabilityThreshold``.
    Consecutive lines with the same chromosome name form one chromosome.

    Results are published through module-level globals:
    ``listOfMappableFrags``, ``chrList``, ``possiblePairsPerDistance``
    (updated in place, keyed ``"<chr>-<lowMid>-<highMid>"``),
    ``possibleInterAllCount``, ``possibleIntraAllCount``,
    ``possibleIntraInRangeCount``, ``baselineInterChrProb`` and
    ``baselineIntraChrProb``.  A baseline probability is only assigned when
    its pair count is positive; otherwise the global keeps its previous
    value.  An empty input file yields empty lists and zero counts.

    Args:
        infilename: path of the gzip-compressed fragments file.

    Returns:
        None.
    """
    sys.stderr.write(
        "\nGenerating all possible intra-chromosomal fragment pairs and counting the number of all possible inter-chr fragment pairs\n"
    )
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )
    global listOfMappableFrags  # two dimensional list with all mappable fragment midpoints for each chr
    global chrList  # list of all chromosomes (chrno (type=int))
    global possiblePairsPerDistance  # all possible intra-chr fragment pairs
    global possibleInterAllCount  # count of all possible inter-chr fragment pairs
    global possibleIntraAllCount  # count of all possible intra-chr fragment pairs
    global possibleIntraInRangeCount  # count of all possible intra-chr fragment pairs in the range we're interested
    global baselineInterChrProb  # 1 divided by all possible inter-chr fragment pairs
    global baselineIntraChrProb  #  1 divided by all possible intra-chr fragment pairs

    listOfMappableFrags = []
    chrList = []

    # Single pass over the fragments file; a change of chromosome name
    # closes the current chromosome and starts a new one.
    fragsPerChr = []  # mappable midpoints of the chromosome being read
    totalNoOfFrags = 0  # total number of all mappable fragments
    currChrNo = None  # None until the first line has been read
    # NOTE(review): mode 'r' is kept from the original; under Python 3 it
    # yields bytes, so chromosome names would be bytes objects — confirm
    # which interpreter this module targets.
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words = line.rstrip().split()
            chrNo = words[0]  # can be an integer or a string
            #words[1] ignored
            midPoint = int(words[2])
            hitCount = int(words[3])
            if currChrNo is None:
                currChrNo = chrNo
            elif currChrNo != chrNo:
                # whenever the name of the chromosome changes
                listOfMappableFrags.append(fragsPerChr)
                totalNoOfFrags += len(fragsPerChr)
                chrList.append(currChrNo)
                currChrNo = chrNo
                fragsPerChr = []
            # add the mappable midPoints to the temp fragsPerChr
            if hitCount >= mappabilityThreshold:
                fragsPerChr.append(midPoint)

    # handle the last chromosome (skipped when the file had no lines)
    if currChrNo is not None:
        listOfMappableFrags.append(fragsPerChr)
        totalNoOfFrags += len(fragsPerChr)
        chrList.append(currChrNo)

    # create all possible frag pairs
    possibleInterAllCount = 0
    possibleIntraInRangeCount = 0
    possibleIntraAllCount = 0
    # Walk names and fragment lists in lockstep.  Looking the name up with
    # chrList.index() would return the first occurrence, so a chromosome
    # whose lines are not contiguous would have its first run counted twice.
    for chrName, frags in zip(chrList, listOfMappableFrags):
        countIntraPairs = 0
        tempLen = len(frags)
        interPairsForChr = (totalNoOfFrags - tempLen) * tempLen
        possibleInterAllCount += interPairsForChr
        # iterate over all possible intra-chr pairs to see which ones qualify as a 'possible' pair
        for first in range(tempLen):
            for second in range(first + 1, tempLen):
                interactionDistance = abs(frags[first] - frags[second])
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    countIntraPairs += 1
                    dictkey = str(chrName) + '-' + str(
                        min(frags[first], frags[second])) + '-' + str(
                            max(frags[first], frags[second]))
                    # set count to zero for now and bias to 1.0
                    possiblePairsPerDistance[dictkey] = [
                        interactionDistance, 0, 1.0
                    ]
                possibleIntraAllCount += 1
        possibleIntraInRangeCount += countIntraPairs
        sys.stderr.write("Chromosome " +repr(chrName) +",\t"+str(tempLen) +" mappable fragments, \t"+str(countIntraPairs)\
        +" possible intra-chr fragment pairs in range,\t" + str(interPairsForChr) +" possible inter-chr fragment pairs\n")

    # Every inter-chr pair was counted once from each side, so the sum is
    # always even; floor division keeps the count an integer on Python 3 too.
    possibleInterAllCount = possibleInterAllCount // 2
    sys.stderr.write("Total of \t"+str(possibleIntraInRangeCount) +" possible intra-chr fragment pairs in range,\t"\
    +str(possibleIntraAllCount) +" possible intra-chr fragment pairs,\t"\
    +str(possibleInterAllCount) +" possible inter-chr fragment pairs\n")
    # calculate baseline probabilities; a zero count leaves the global as-is
    if possibleInterAllCount > 0:
        baselineInterChrProb = 1.0 / possibleInterAllCount
    if possibleIntraAllCount > 0:
        baselineIntraChrProb = 1.0 / possibleIntraAllCount

    return  # from generate_FragPairs
Example #3
0
def generate_FragPairs(binStats, fragsfile, resolution):
    """Count possible fragment pairs and accumulate them into ``binStats``.

    Reads the gzip text file ``fragsfile`` (column 0: chromosome, column 2:
    midpoint, column 3: hit count) and keeps midpoints whose hit count is at
    least the module-level ``mappThres``.  Two enumeration modes exist:

    * ``resolution`` truthy: for each chromosome, distances
      ``0, resolution, 2*resolution, ...`` up to the largest midpoint minus
      ``resolution / 2`` are visited, and the k-th distance contributes
      ``n - k`` pairs (``n`` = mappable fragments on that chromosome).
    * ``resolution`` falsy: every pair of mappable midpoints on a chromosome
      is visited explicitly.

    For each in-range distance (``myUtils.in_range_check`` against the
    module-level ``distLowThres``/``distUpThres``) the matching entry of
    ``binStats`` is located by its ``[0][0] <= distance <= [0][1]`` bounds
    and its slots 1, 3 and 7 are incremented in place.  Progress and totals
    are appended to the module-level ``logfile``.

    Args:
        binStats: mapping from consecutive integer bin index (starting at 0)
            to a mutable bin record; modified in place.
        fragsfile: path of the gzip-compressed fragments file.
        resolution: fixed bin size, or a falsy value for explicit pairs.

    Returns:
        Tuple ``(binStats, noOfFrags, maxPossibleGenomicDist,
        possibleIntraInRangeCount, possibleInterAllCount, interChrProb,
        baselineIntraChrProb)``.  ``interChrProb`` is 0 when there are no
        inter-chromosomal pairs.

    Raises:
        ZeroDivisionError: when ``possibleIntraAllCount`` ends up zero.
    """
    with open(logfile, 'a') as log:
        if resolution:
            log.write("Looping through all possible fragment pairs in-range\n")
        else:
            log.write("Enumerating all possible fragment pairs in-range\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )
    startT = time.time()

    minPossibleGenomicDist = float("inf")
    maxPossibleGenomicDist = 0
    possibleIntraAllCount = 0
    possibleInterAllCount = 0
    possibleIntraInRangeCount = 0

    # chromosome -> list of mappable midpoints.  A chromosome is registered
    # even when none of its fragments passes the mappability threshold.
    allFragsDic = {}
    with gzip.open(fragsfile, 'rt') as infile:
        for line in infile:
            words = line.split()
            currChr = words[0]
            currMid = int(words[2])
            currHit = int(words[3])
            if currChr not in allFragsDic:
                allFragsDic[currChr] = []
            if currHit >= mappThres:
                allFragsDic[currChr].append(currMid)

    noOfFrags = sum(len(frags) for frags in allFragsDic.values())

    if resolution:
        maxFrags = {}
        for ch, frags in allFragsDic.items():
            # Largest bin start on this chromosome.  A chromosome without
            # mappable fragments gets -1 so that no distance is enumerated
            # for it (max() of an empty list would raise ValueError).
            maxFrags[ch] = max(frags) - resolution / 2 if frags else -1
            maxPossibleGenomicDist = max(maxPossibleGenomicDist, maxFrags[ch])

        for ch in sorted(allFragsDic.keys()):
            maxFrag = maxFrags[ch]
            n = len(allFragsDic[ch])
            binTracker = 0
            possibleIntraInRangeCountPerChr = 0
            # d is the number of bins separating the two fragments; there are
            # n - d such pairs (d == 0 pairs a fragment with itself).
            for d, intxnDistance in enumerate(
                    range(0, int(maxFrag + 1), resolution)):
                if not myUtils.in_range_check(intxnDistance, distLowThres,
                                              distUpThres):
                    continue
                npairs = n - d
                minPossibleGenomicDist = min(minPossibleGenomicDist,
                                             intxnDistance)
                possibleIntraInRangeCountPerChr += npairs
                # Advance to the bin containing this distance; distances only
                # grow, so the tracker never moves backwards.  Past the last
                # bin, the last bin is used.
                currBin = binStats[binTracker]
                while not (currBin[0][0] <= intxnDistance <= currBin[0][1]):
                    if binTracker + 1 not in binStats:
                        break
                    binTracker += 1
                    currBin = binStats[binTracker]
                currBin[7] += npairs
                currBin[1] += npairs
                currBin[3] += (float(intxnDistance / distScaling) * npairs)
            possibleInterAllCount += n * (noOfFrags - n)
            possibleIntraAllCount += (n *
                                      (n + 1)) / 2  # n(n-1) if excluding self
            with open(logfile, 'a') as log:
                log.write("Chromosome " +repr(ch) +",\t"+str(n) +" mappable fragments, \t"+str(possibleIntraInRangeCountPerChr)\
                +" possible intra-chr fragment pairs in range,\t" + str((noOfFrags-n)*n) +" possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr
    else:
        for ch in sorted(allFragsDic.keys()):
            fragsPerChr = sorted(allFragsDic[ch])
            templen = len(fragsPerChr)
            possibleInterAllCount += (noOfFrags - templen) * templen
            possibleIntraInRangeCountPerChr = 0
            for left in range(templen):
                # Distances grow with the right-hand index, so the bin search
                # restarts from the first bin for every left-hand fragment.
                binTracker = 0
                d = 0
                for right in range(left + 1, templen):
                    intxnDistance = abs(
                        float(fragsPerChr[left]) - float(fragsPerChr[right]))
                    if not myUtils.in_range_check(intxnDistance, distLowThres,
                                                  distUpThres):
                        continue
                    possibleIntraInRangeCountPerChr += 1
                    maxPossibleGenomicDist = max(maxPossibleGenomicDist,
                                                 intxnDistance)
                    minPossibleGenomicDist = min(minPossibleGenomicDist,
                                                 intxnDistance)
                    npairs = templen - d
                    d += 1
                    currBin = binStats[binTracker]
                    while not (currBin[0][0] <= intxnDistance <=
                               currBin[0][1]):
                        if binTracker + 1 not in binStats:
                            break
                        binTracker += 1
                        currBin = binStats[binTracker]
                    currBin[7] += npairs
                    currBin[1] += 1
                    currBin[3] += float(intxnDistance / distScaling) * npairs
                    # Only in-range pairs are counted in this mode.
                    possibleIntraAllCount += 1
            with open(logfile, 'a') as log:
                log.write("Chromosome " +repr(ch) +",\t"+str(templen) +" mappable fragments, \t"+str(possibleIntraInRangeCountPerChr)\
                +" possible intra-chr fragment pairs in range,\t" + str((noOfFrags-templen)*templen) +" possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr

    # Each inter-chr pair was counted once from each side.
    possibleInterAllCount /= 2
    try:
        interChrProb = 1.0 / possibleInterAllCount
    except ZeroDivisionError:
        # Single chromosome (or no fragments): no inter-chr pairs exist.
        interChrProb = 0
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    endT = time.time()
    print("Fragments file read. Time took %s" % (endT - startT))

    with open(logfile, 'a') as log:
        log.write("Number of all fragments= %s\n" % (noOfFrags))
        log.write("Possible, Intra-chr in range: pairs= %s \n" %
                  (possibleIntraInRangeCount))
        log.write("Possible, Intra-chr all: pairs= %s \n" %
                  (possibleIntraAllCount))
        log.write("Possible, Inter-chr all: pairs= %s \n" %
                  (possibleInterAllCount))
        log.write("Desired genomic distance range   [%d %s] \n" %
                  (distLowThres, distUpThres))
        log.write("Range of possible genomic distances  [%d  %d] \n" %
                  (minPossibleGenomicDist, maxPossibleGenomicDist))
        log.write("Baseline intrachromosomal probability is %s \n" %
                  (baselineIntraChrProb))
        log.write("Interchromosomal probability is %s \n" % (interChrProb))

    return (binStats, noOfFrags, maxPossibleGenomicDist,
            possibleIntraInRangeCount, possibleInterAllCount, interChrProb,
            baselineIntraChrProb)  # return from generate_FragPairs
def generate_FragPairs(mainDic,infilename): # lowMappThres
    """Count possible fragment pairs per genomic distance at fixed resolution.

    Reads the gzipped file ``infilename`` (column 0: chromosome, column 1:
    fragment midpoint, column 3: a numeric value that is parsed but unused)
    and, for every multiple of the module-level ``resolution`` up to the
    largest midpoint minus half a bin, adds the number of fragment pairs at
    that distance to ``mainDic[distance][0]``.

    Updates the module-level globals ``maxPossibleGenomicDist``,
    ``possibleIntraAllCount``, ``possibleInterAllCount``,
    ``possibleIntraInRangeCount``, ``interChrProb`` and
    ``baselineIntraChrProb``; the counts are accumulated onto whatever value
    the globals hold on entry.  ``interChrProb`` is set to 0 when there are
    no inter-chromosomal pairs.

    Args:
        mainDic: dict filled in place with ``distance -> [pairCount, 0]``.
        infilename: path of the gzip-compressed fragments file.

    Returns:
        Tuple ``(mainDic, noOfFrags)``.
    """
    # NOTE(review): the trailing commas after these print calls are kept
    # from the original so the output stays the same on either Python major
    # version — confirm the target interpreter before removing them.
    print("\nEnumerating all possible intra-chromosomal fragment pairs in-range\n"),
    print("------------------------------------------------------------------------------------\n"),
    global maxPossibleGenomicDist
    global possibleIntraAllCount
    global possibleInterAllCount
    global possibleIntraInRangeCount
    global interChrProb
    global baselineIntraChrProb

    # chromosome -> {midpoint token: 0-based line index}
    allFragsDic = {}
    with gzip.open(infilename, 'r') as infile:
        for indx, line in enumerate(infile):
            words = line.split()
            currChr = words[0]
            currMid = words[1]
            # Parsed as in the original; the value itself is not used.
            float(words[3])
            if currChr not in allFragsDic:
                allFragsDic[currChr] = {}
            allFragsDic[currChr][currMid] = indx

    noOfFrags = 0
    maxFrags = {}
    for ch in allFragsDic:
        # Floor division keeps the bin start an integer; with true division
        # (Python 3) the float result would make the range() calls below
        # raise TypeError.
        maxFrags[ch] = max([int(i) - resolution // 2 for i in allFragsDic[ch]])
        noOfFrags += len(allFragsDic[ch])
        maxPossibleGenomicDist = max(maxPossibleGenomicDist, maxFrags[ch])

    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        mainDic[i] = [0, 0]

    for ch in allFragsDic:
        n = len(allFragsDic[ch])
        # d bins apart there are n - d pairs (d == 0 pairs a bin with itself).
        for d, i in enumerate(range(0, maxFrags[ch] + 1, resolution)):
            mainDic[i][0] += n - d
        possibleInterAllCount += n * (noOfFrags - n)
        possibleIntraAllCount += (n * (n + 1)) // 2  # n(n-1) if excluding self

    # Each inter-chr pair was counted once from each side.
    possibleInterAllCount //= 2
    # A single-chromosome input has no inter-chr pairs; avoid dividing by 0.
    if possibleInterAllCount:
        interChrProb = 1.0 / possibleInterAllCount
    else:
        interChrProb = 0
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        if myUtils.in_range_check(i, distLowThres, distUpThres):
            possibleIntraInRangeCount += mainDic[i][0]

    print("Number of all fragments= "+str(noOfFrags)+"\t resolution= "+ str(resolution))
    print("Possible, Intra-chr in range: pairs= "+str(possibleIntraInRangeCount))
    print("Possible, Intra-chr all: pairs= "+str(possibleIntraAllCount))
    print("Possible, Inter-chr all: pairs= "+str(possibleInterAllCount))
    print("Desired genomic distance range	[%d %d]" % (distLowThres,distUpThres) + "\n"),
    print("Range of possible genomic distances	[0	%d]" % (maxPossibleGenomicDist) + "\n"),

    return (mainDic,noOfFrags) # return from generate_FragPairs
Example #5
0
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname,
               passNo):
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )
    sys.stderr.write("baseline intra-chr probability: " +
                     repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " +
                     repr(baselineInterChrProb) + "\n")
    # xi and yi will be used only for visualization purposes
    # acutal fit and residual is all done on vectors x and y
    xi = np.linspace(min(x), max(x), overSample * len(x))

    # assume residualFactor==-1:
    splineError = min(y) * min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)
    yi = ius(xi)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### NOW I DO THIS BY CALLING AN R function CALLED MONOREG
    ### This does the isotonic regression using option antitonic to make sure
    ### I get monotonically decreasing probabilites with increasion genomic distance

    tempMaxX = max(x)
    tempMinX = min(x)
    tempList = sorted(list(set([int(i[0]) for i in sortedInteractions])))
    splineX = []
    ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
    ### Therefore everything will be within the range where the spline is defined
    for i in tempList:
        if tempMinX <= i and i <= tempMaxX:
            splineX.append(i)
    # END for
    #print len(splineX)
    splineY = ius(splineX)

    # R vector format
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    # do the antitonic regression
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = []
    diff = []
    diffX = []
    for i in range(len(rNewSplineY)):
        newSplineY.append(rNewSplineY[i])
        if (splineY[i] - newSplineY[i]) > 0:
            diff.append(splineY[i] - newSplineY[i])
            diffX.append(splineX[i])
    # END for
    #print len(splineX)

    residual = sum([i * i for i in (y - ius(x))])

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb),
                 myUtils.scale_a_list(newSplineY, toProb),
                 'g-',
                 label="spline-" + str(passNo),
                 linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb),
                     fmt='r.',
                     label="Mean with std. error",
                     linewidth=2)

        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for i in x],
                                          toProb),
                     'k-',
                     label="Baseline intra-chromosomal")
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for i in x],
                                          toProb),
                     'b-',
                     label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)

        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for i in x], 'k-')
            plt.loglog(x, [baselineIntraChrProb for i in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')

        plt.savefig(outdir + '/' + figname + '.png')

    # NOW write the calculated pvalues and corrected pvalues in a file
    infile = gzip.open(infilename, 'r')
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " +
                     repr(distUpThres) + "\n")
    p_vals = []
    q_vals = []
    for line in infile:
        words = line.rstrip().split()
        interxn = myUtils.Interaction(
            [words[0], int(words[1]), words[2],
             int(words[3])])
        interxn.setCount(int(words[4]))
        chr1 = words[0]
        chr2 = words[2]
        midPoint1 = int(words[1])
        midPoint2 = int(words[3])

        bias1 = 1.0
        bias2 = 1.0
        # assumes there is no bias to begin with
        # if the biasDic is not null sets the real bias values
        if len(biasDic) > 0:
            if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                bias1 = biasDic[chr1][midPoint1]
            if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                bias2 = biasDic[chr2][midPoint2]

        if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
            prior_p = 1.0
            p_val = 1.0
            p_vals.append(p_val)
        elif interxn.getType(distLowThres, distUpThres) == 'intraInRange':
            # make sure the interaction distance is covered by the probability bins
            distToLookUp = max(interxn.distance, min(x))
            distToLookUp = min(distToLookUp, max(x))
            i = min(bisect.bisect_left(splineX, distToLookUp),
                    len(splineX) - 1)
            #prior_p=newSplineY[i]
            prior_p = newSplineY[i] * (bias1 * bias2
                                       )  # biases added in the picture
            intraInRangeCount += 1
            ############# THIS HAS TO BE interactionCount-1 ##################
            p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraInRangeSum,
                               prior_p)
            p_vals.append(p_val)

        elif interxn.getType(distLowThres, distUpThres) == 'intraShort':
            prior_p = 1.0
            p_val = 1.0
            intraVeryProximalCount += 1
            p_vals.append(p_val)

        elif interxn.getType(distLowThres, distUpThres) == 'intraLong':
            # out of range bigger than distUpThres
            # use the prior of the baseline intra-chr interaction probability
            prior_p = 1.0  #baselineIntraChrProb*(bias1*bias2)  # biases added in the picture
            p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum,
                               prior_p)
            intraOutOfRangeCount += 1
            p_vals.append(p_val)

        else:
            if useInters:
                #prior_p=baselineIntraChrProb
                prior_p = baselineInterChrProb * (
                    bias1 * bias2)  # biases added in the picture
                ############# THIS HAS TO BE interactionCount-1 ##################
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum,
                                   prior_p)
                interCount += 1
                p_vals.append(p_val)
    # END for
    infile.close()

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) +
                         "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " +
                         repr(possibleIntraInRangeCount) + "\n")

    infile = gzip.open(infilename, 'r')
    outfile = gzip.open(outdir + '/' + figname + '.significances.txt.gz', 'w')
    sys.stderr.write("Writing p-values to file %s" % figname +
                     ".significances.txt.gz\n")
    count = 0
    outfile.write(
        "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
    )

    for line in infile:
        words = line.rstrip().split()
        chrNo1 = words[0]
        midPoint1 = int(words[1])
        chrNo2 = words[2]
        midPoint2 = int(words[3])
        interactionCount = int(words[4])
        p_val = p_vals[count]
        q_val = q_vals[count]

        if useInters == False and chrNo1 == chrNo2:  # intra
            interactionDistance = abs(midPoint1 - midPoint2)  # dist
            if myUtils.in_range_check(interactionDistance, distLowThres,
                                      distUpThres):
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))
        elif useInters == True and chrNo1 != chrNo2:
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                          (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                           interactionCount, p_val, q_val))
        #outfile.write("ALL\t%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))

        count += 1
    # END for - printing pvals and qvals for all the interactions
    outfile.close()

    isOutlier = []
    distsBelow = []
    distsAbove = []
    intcountsBelow = []
    intcountsAbove = []
    belowThresCount = 0
    aboveThresCount = 0
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp = max(interactionDistance, min(x))
        distToLookUp = min(distToLookUp, max(x))
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)  # biases added in the picture
        ############# THIS HAS TO BE interactionCount-1 ##################
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum,
                           prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
            belowThresCount += 1
        else:
            distsAbove.append(interactionDistance)
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)
            aboveThresCount += 1
    # END for - doing the outlier check for all interactions in sortedInteractions

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" %
                         figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        downsample = 30  # for the non-outliers
        randIndcsAbove = sample([i for i in range(len(intcountsAbove))],
                                len(intcountsAbove) / downsample)
        randIndcsAbove = sorted(randIndcsAbove)
        downsample = 20  # for the outliers
        randIndcsBelow = sample([i for i in range(len(intcountsBelow))],
                                len(intcountsBelow) / downsample)
        randIndcsBelow = sorted(randIndcsBelow)

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],
                                      toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.',
                 label="Outliers (p-value < 1/M)")
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist],
                                      toKb),
                 [
                     newSplineY[i] * observedIntraInRangeSum
                     for i in range(len(newSplineY))
                 ] + [newSplineY[-1] * observedIntraInRangeSum],
                 'g-',
                 label="spline-" + str(passNo) + " (x N)",
                 linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        print(repr(len(intcountsBelow)) + "\t"),
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount)+"\tintraOutOfRangeCount " +\
     repr(intraOutOfRangeCount)+"\tintraVeryProximalCount " + repr(intraVeryProximalCount) +"\tinterCount " + repr(interCount)+"\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname +
                         ".qplot.png\n")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              figname + ".qplot")

    infile.close()

    return [splineX, newSplineY, residual, isOutlier, FDRx,
            FDRy]  # from fit_Spline
def generate_FragPairs(mainDic, infilename):  # lowMappThres
    """Count the possible fragment pairs at every genomic distance.

    Reads the gzipped, whitespace-separated fragments file ``infilename``
    (column 0: chromosome, column 1: fragment midpoint).  For every multiple
    of the module-level ``resolution`` from 0 up to ``maxPossibleGenomicDist``
    the entry ``mainDic[dist]`` is reset to ``[0, 0]``; slot 0 is then filled
    with the number of intra-chromosomal fragment pairs at that distance,
    summed over all chromosomes.  Slot 1 is left at 0 by this function.

    Side effects:
        Updates the module-level globals ``maxPossibleGenomicDist``,
        ``possibleIntraAllCount``, ``possibleInterAllCount``,
        ``possibleIntraInRangeCount``, ``interChrProb`` and
        ``baselineIntraChrProb``.  The counters are accumulated onto their
        current values, so they must already exist when this is called.

    Args:
        mainDic: mapping from genomic distance to a two-element list.
        infilename: path of the gzipped fragments file.

    Returns:
        tuple: ``(mainDic, noOfFrags)`` where ``noOfFrags`` is the number of
        distinct (chromosome, midpoint) entries read.
    """
    # The trailing commas suppress the extra newline under a Python 2 print
    # statement; under Python 3 they only build a throwaway tuple.
    print(
        "\nEnumerating all possible intra-chromosomal fragment pairs in-range\n"
    ),
    print(
        "------------------------------------------------------------------------------------\n"
    ),
    global maxPossibleGenomicDist
    global possibleIntraAllCount
    global possibleInterAllCount
    global possibleIntraInRangeCount
    global interChrProb
    global baselineIntraChrProb

    # Only the set of distinct midpoints per chromosome is needed below.
    midsPerChr = {}
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words = line.split()
            midsPerChr.setdefault(words[0], set()).add(words[1])

    noOfFrags = 0
    maxFrags = {}
    for ch, mids in midsPerChr.items():
        # Floor division keeps the distances integral so they can be fed to
        # range() on Python 3 as well as Python 2.
        maxFrags[ch] = max(int(mid) for mid in mids) - resolution // 2
        noOfFrags += len(mids)
        maxPossibleGenomicDist = max(maxPossibleGenomicDist, maxFrags[ch])

    for dist in range(0, maxPossibleGenomicDist + 1, resolution):
        mainDic[dist] = [0, 0]

    for ch, mids in midsPerChr.items():
        n = len(mids)
        # With n fragments there are n - d pairs that are d bins apart.
        for d, dist in enumerate(range(0, maxFrags[ch] + 1, resolution)):
            mainDic[dist][0] += n - d
        possibleInterAllCount += n * (noOfFrags - n)
        possibleIntraAllCount += (n * (n + 1)) // 2  # n(n-1) if excluding self

    # Every inter-chromosomal pair was counted once from each side.
    possibleInterAllCount //= 2
    # A single-chromosome input has no inter-chromosomal pairs at all; fall
    # back to 0 instead of raising ZeroDivisionError.
    if possibleInterAllCount:
        interChrProb = 1.0 / possibleInterAllCount
    else:
        interChrProb = 0.0
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    for dist in range(0, maxPossibleGenomicDist + 1, resolution):
        if myUtils.in_range_check(dist, distLowThres, distUpThres):
            possibleIntraInRangeCount += mainDic[dist][0]

    print("Number of all fragments= " + str(noOfFrags) + "\t resolution= " +
          str(resolution))
    print("Possible, Intra-chr in range: pairs= " +
          str(possibleIntraInRangeCount))
    print("Possible, Intra-chr all: pairs= " + str(possibleIntraAllCount))
    print("Possible, Inter-chr all: pairs= " + str(possibleInterAllCount))
    print(
        "Desired genomic distance range\t[%d %d]" %
        (distLowThres, distUpThres) + "\n"),
    print(
        "Range of possible genomic distances\t[0\t%d]" %
        (maxPossibleGenomicDist) + "\n"),

    return (mainDic, noOfFrags)  # return from generate_FragPairs
def calculate_Probabilities(mainDic, outfilename):
    """Group genomic distances into equal-occupancy bins of contact counts.

    Walks the distances ``0, resolution, 2*resolution, ...`` up to
    ``maxPossibleGenomicDist``.  Distances rejected by
    ``myUtils.in_range_check`` are skipped, although their contacts still
    count toward the running total.  Accepted distances are collected until
    the collected contacts reach ``desiredPerBin``, which starts at
    ``observedIntraInRangeSum / noOfBins`` and is recomputed from the
    remaining contacts after each bin is closed.  Distances left over after
    the last closed bin are not reported.

    One row per bin is printed and also written to
    ``<outfilename>.res<resolution>.txt``.

    Args:
        mainDic: mapping distance -> sequence whose slot 0 is the number of
            possible pairs and slot 1 the observed contact count.
        outfilename: prefix of the output text file.

    Returns:
        list: ``[x, y, yerr]`` - per-bin average distance, mean contact
        probability and standard error (currently always 0).
    """
    # Trailing commas: no extra newline under Python 2's print statement,
    # a harmless throwaway tuple under Python 3.
    print("\nCalculating probability means and standard deviations by equal-occupancy binning of contact counts\n"),
    print("------------------------------------------------------------------------------------\n"),
    outpath = outfilename + '.res' + str(resolution) + '.txt'

    desiredPerBin = (observedIntraInRangeSum) / noOfBins
    print("observed intra-chr read counts in range\t" + repr(observedIntraInRangeSum) + ",\tdesired number of contacts per bin\t" + repr(desiredPerBin) + ",\tnumber of bins\t" + repr(noOfBins) + "\n"),

    # the following five lists will be the print outputs
    x = []  # avg genomic distances of bins
    y = []  # avg interaction probabilities of bins
    yerr = []  # stderrs of bins
    pairCounts = []  # number of pairs in bins
    interactionTotals = []  # number of interactions (reads) in bins
    interactionTotalForBinTermination = 0
    n = 0  # bin counter so far
    totalInteractionCountSoFar = 0
    distsToGoInAbin = []
    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        contacts = mainDic[i][1]
        totalInteractionCountSoFar += contacts
        if not myUtils.in_range_check(i, distLowThres, distUpThres):
            continue
        distsToGoInAbin.append(i)
        # The bin closes when this distance alone, or together with what has
        # been collected so far, reaches the target occupancy.
        binFull = (contacts >= desiredPerBin or
                   interactionTotalForBinTermination + contacts >= desiredPerBin)
        if not binFull:
            # otherwise keep accumulating distances into the current bin
            interactionTotalForBinTermination += contacts
            continue

        # dynamically update the desiredPerBin after each bin is full
        n += 1
        if n < noOfBins:
            desiredPerBin = 1.0 * (observedIntraInRangeSum - totalInteractionCountSoFar) / (noOfBins - n)
        se_p = 0  # for now I'm not worrying about error etc.
        noOfPairsForBin = sum(mainDic[b][0] for b in distsToGoInAbin)
        interactionTotalForBin = sum(mainDic[b][1] for b in distsToGoInAbin)
        avgDistance = sum(1.0 * mainDic[b][0] * (b / distScaling) for b in distsToGoInAbin)

        meanProbabilityObsv = (1.0 * interactionTotalForBin / noOfPairsForBin) / observedIntraInRangeSum
        avgDistance = distScaling * (avgDistance / noOfPairsForBin)
        # append this bin
        x.append(float(avgDistance))
        y.append(float(meanProbabilityObsv))
        yerr.append(float(se_p))
        pairCounts.append(noOfPairsForBin)
        interactionTotals.append(interactionTotalForBin)

        # Parenthesised single argument: valid on both Python 2 and 3.
        print("%d\t%f\t%.2e\t%.2e\t%d\t%d" % (n, avgDistance, meanProbabilityObsv, se_p, noOfPairsForBin, interactionTotalForBin))
        # reset counts
        interactionTotalForBinTermination = 0
        distsToGoInAbin = []
    # END for

    print("Writing equal-occupancy binning results to %s" % outpath + "\n"),
    # Opened only now so a failure above cannot leave a truncated file behind.
    with open(outpath, 'w') as outfile:
        outfile.write("avgGenomicDist\tcontactProbability\tstandardError\tnoOfLocusPairs\ttotalOfContactCounts\n")
        for row in zip(x, y, yerr, pairCounts, interactionTotals):
            outfile.write("%d\t%.2e\t%.2e\t%d\t%d\n" % row)
    return [x, y, yerr]  # from calculate_Probabilities
def calculate_Probabilities(mainDic, outfilename):
    """Group genomic distances into equal-occupancy bins of contact counts.

    Walks the distances ``0, resolution, 2*resolution, ...`` up to
    ``maxPossibleGenomicDist``.  Distances rejected by
    ``myUtils.in_range_check`` are skipped, although their contacts still
    count toward the running total.  Accepted distances are collected until
    the collected contacts reach ``desiredPerBin``, which starts at
    ``observedIntraInRangeSum / noOfBins`` and is recomputed from the
    remaining contacts after each bin is closed.  Distances left over after
    the last closed bin are not reported.

    One row per bin is printed and also written to
    ``<outfilename>.res<resolution>.txt``.

    Args:
        mainDic: mapping distance -> sequence whose slot 0 is the number of
            possible pairs and slot 1 the observed contact count.
        outfilename: prefix of the output text file.

    Returns:
        list: ``[x, y, yerr]`` - per-bin average distance, mean contact
        probability and standard error (currently always 0).
    """
    # Trailing commas: no extra newline under Python 2's print statement,
    # a harmless throwaway tuple under Python 3.
    print(
        "\nCalculating probability means and standard deviations by equal-occupancy binning of contact counts\n"
    ),
    print(
        "------------------------------------------------------------------------------------\n"
    ),
    outpath = outfilename + '.res' + str(resolution) + '.txt'

    desiredPerBin = (observedIntraInRangeSum) / noOfBins
    print("observed intra-chr read counts in range\t" +
          repr(observedIntraInRangeSum) +
          ",\tdesired number of contacts per bin\t" + repr(desiredPerBin) +
          ",\tnumber of bins\t" + repr(noOfBins) + "\n"),

    # the following five lists will be the print outputs
    x = []  # avg genomic distances of bins
    y = []  # avg interaction probabilities of bins
    yerr = []  # stderrs of bins
    pairCounts = []  # number of pairs in bins
    interactionTotals = []  # number of interactions (reads) in bins
    interactionTotalForBinTermination = 0
    n = 0  # bin counter so far
    totalInteractionCountSoFar = 0
    distsToGoInAbin = []
    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        contacts = mainDic[i][1]
        totalInteractionCountSoFar += contacts
        if not myUtils.in_range_check(i, distLowThres, distUpThres):
            continue
        distsToGoInAbin.append(i)
        # The bin closes when this distance alone, or together with what has
        # been collected so far, reaches the target occupancy.
        binFull = (contacts >= desiredPerBin or
                   interactionTotalForBinTermination + contacts >=
                   desiredPerBin)
        if not binFull:
            # otherwise keep accumulating distances into the current bin
            interactionTotalForBinTermination += contacts
            continue

        # dynamically update the desiredPerBin after each bin is full
        n += 1
        if n < noOfBins:
            desiredPerBin = 1.0 * (observedIntraInRangeSum -
                                   totalInteractionCountSoFar) / (noOfBins - n)
        se_p = 0  # for now I'm not worrying about error etc.
        noOfPairsForBin = sum(mainDic[b][0] for b in distsToGoInAbin)
        interactionTotalForBin = sum(mainDic[b][1] for b in distsToGoInAbin)
        avgDistance = sum(1.0 * mainDic[b][0] * (b / distScaling)
                          for b in distsToGoInAbin)

        meanProbabilityObsv = (1.0 * interactionTotalForBin /
                               noOfPairsForBin) / observedIntraInRangeSum
        avgDistance = distScaling * (avgDistance / noOfPairsForBin)
        # append this bin
        x.append(float(avgDistance))
        y.append(float(meanProbabilityObsv))
        yerr.append(float(se_p))
        pairCounts.append(noOfPairsForBin)
        interactionTotals.append(interactionTotalForBin)

        # Parenthesised single argument: valid on both Python 2 and 3.
        print("%d\t%f\t%.2e\t%.2e\t%d\t%d" %
              (n, avgDistance, meanProbabilityObsv, se_p, noOfPairsForBin,
               interactionTotalForBin))
        # reset counts
        interactionTotalForBinTermination = 0
        distsToGoInAbin = []
    # END for

    print("Writing equal-occupancy binning results to %s" % outpath + "\n"),
    # Opened only now so a failure above cannot leave a truncated file behind.
    with open(outpath, 'w') as outfile:
        outfile.write(
            "avgGenomicDist\tcontactProbability\tstandardError\tnoOfLocusPairs\ttotalOfContactCounts\n"
        )
        for row in zip(x, y, yerr, pairCounts, interactionTotals):
            outfile.write("%d\t%.2e\t%.2e\t%d\t%d\n" % row)
    return [x, y, yerr]  # from calculate_Probabilities
Example #9
0
def _lookup_bias(biasDic, chrom, mid):
    """Return the bias stored for (chrom, mid), or -1 with a printed warning if absent."""
    if chrom not in biasDic:
        print("Warning. Bias file does not contain chromosome %s. "
              "Please ensure you're using correct file. Fit-Hi-C will continue with "
              "bias = -1 for this locus" % chrom)
        return -1
    if mid not in biasDic[chrom]:
        print("Error. Bias file does not contain midpoint %s within "
              "%s. Please ensure you're using the correct file and/or resolution "
              "argument. Fit-Hi-C will continue with bias = -1 for this locus"
              % (mid, chrom))
        return -1
    return biasDic[chrom][mid]


def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit the distance/probability spline and score every interaction.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    is fitted to the binned means ``(x, y)``, evaluated at the keys of
    ``mainDic`` that lie inside ``[min(x), max(x)]`` and made non-increasing
    with ``IsotonicRegression``.  When ``visual`` is true the fit is saved to
    ``<outfilename>.png``.

    The gzipped interactions file ``infilename`` (five whitespace-separated
    columns: chr1, mid1, chr2, mid2, count) is then read twice: once to
    compute a p-value per line with ``scsp.bdtrc(count - 1, N, prior)``, and
    once more to write the rows, together with Benjamini-Hochberg q-values
    and the two biases, to ``<outfilename>[.res<resolution>].significances.txt.gz``.

    Args:
        mainDic: mapping whose keys are the genomic distances to evaluate.
        x, y, yerr: per-bin average distance, mean probability and error;
            ``x`` is sorted in place when ``outliersdist`` is not None.
        infilename: gzipped interactions file.
        outfilename: prefix for every file written here.
        biasDic: ``{chromosome: {midpoint: bias}}``; when empty or None every
            bias is taken as 1.0.
        outliersline, outliersdist: collections with an ``add`` method; the
            line index and the ``abs(mid1 - mid2)`` of every line whose
            p-value is below the outlier threshold are added to them.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to
            ``bdtrc`` for in-range intra and for inter interactions.
        possibleIntraInRangeCount, possibleInterAllCount: numbers of possible
            pairs; they set the multiple-testing total.
        observedIntraAllSum: not used by this function.
        resolution: when truthy, ``.res<resolution>`` is inserted into the
            output file name.
        passNo: label used in the plot legend.

    Returns:
        list: ``[splineX, newSplineY, residual, outliersline, outliersdist,
        FDRx, FDRy]``; the first three are None when ``interOnly`` is set.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        # The spline needs strictly increasing distances.
        for i in range(1, len(x)):
            if x[i] <= x[i-1]:
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i-1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        minY = min(y)
        splineError = minY * minY

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX = max(x)
        tempMinX = min(x)
        # Keep only distances inside [min(x), max(x)], where the spline is defined.
        splineX = [dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX]
        splineY = ius(splineX)

        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([r * r for r in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb), myUtils.scale_a_list(newSplineY, toProb), 'g-', label="spline-" + str(passNo), linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb), myUtils.scale_a_list(y, toProb), myUtils.scale_a_list(yerr, toProb), fmt='r.', label="Mean with std. error", linewidth=2)

            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)

            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename + '.png')

    # First pass over the interactions: one p-value (and bias pair) per line.
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres, distUpThres)
            # assumes there is no bias to begin with
            bias1 = 1.0
            bias2 = 1.0
            # if the biasDic is not null sets the real bias values
            if biasDic:
                bias1 = _lookup_bias(biasDic, ch1, mid1)
                bias2 = _lookup_bias(biasDic, ch2, mid2)
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # locus without a usable bias: never significant
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # clamp the distance to the range covered by the spline
                distToLookUp = min(max(interxn.getDistance(), tempMinX), tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedIntraInRangeSum, prior_p)
            elif interactionType in ('intraShort', 'intraLong') and not interOnly:
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction; the number of tests depends on the regions analysed.
    if allReg:
        totalTests = possibleIntraInRangeCount + possibleInterAllCount
    elif interOnly:
        totalTests = possibleInterAllCount
    else:
        totalTests = possibleIntraInRangeCount
    outlierThres = 1.0 / totalTests
    q_vals = myStats.benjamini_hochberg_correction(p_vals, totalTests)
    print("Outlier threshold is... %s" % (outlierThres))

    # now we write the values back to the file
    if resolution:
        outpath = outfilename + '.res' + str(resolution) + '.significances.txt.gz'
    else:
        outpath = outfilename + '.significances.txt.gz'
    print("Writing p-values and q-values to file %s" % outpath)
    with gzip.open(infilename, 'rt') as infile, gzip.open(outpath, 'wt') as outfile:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]
            bias1 = biasl[count]
            bias2 = biasr[count]
            interactionDistance = abs(midPoint1 - midPoint2)

            if chr1 != chr2:
                writeRow = allReg or interOnly
            else:
                writeRow = (allReg or not interOnly) and myUtils.in_range_check(interactionDistance, distLowThres, distUpThres)
            if writeRow:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(interactionDistance)

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment, outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy]  # from fit_Spline
Example #10
0
def _find_bin(binStats, binTracker, intxnDistance):
    """Advance binTracker until its bin range holds intxnDistance or no next bin exists."""
    while True:
        binRange = binStats[binTracker][0]
        if binRange[0] <= intxnDistance <= binRange[1]:
            return binTracker
        if binTracker + 1 not in binStats:
            # ran off the end: stay on the last existing bin
            return binTracker
        binTracker += 1


def generate_FragPairs(binStats, fragsfile, resolution):
    """Count possible fragment pairs and add them to the distance bins.

    Reads the gzipped fragments file ``fragsfile`` (column 0: chromosome,
    column 2: midpoint, column 3: hit count) and keeps the midpoints whose
    hit count is at least the module-level ``mappThres``.

    With a truthy ``resolution`` the pairs are derived arithmetically: for
    each chromosome with ``n`` kept fragments, distance ``d * resolution``
    contributes ``n - d`` pairs.  Otherwise every pair of kept fragments on a
    chromosome is enumerated.  In both modes only distances accepted by
    ``myUtils.in_range_check`` are added to a bin; the bin is the entry of
    ``binStats`` whose ``[0]`` element (a ``(min, max)`` range) contains the
    distance, or the last bin reached if none does.  Slots 1, 3 and 7 of that
    entry are incremented in place.  Progress and totals are appended to the
    module-level ``logfile``.

    Args:
        binStats: mapping from consecutive integer bin indices (starting at
            0) to mutable bin records.
        fragsfile: path of the gzipped fragments file.
        resolution: fixed bin size, or a falsy value for non-fixed-size
            fragments.

    Returns:
        tuple: ``(binStats, noOfFrags, maxPossibleGenomicDist,
        possibleIntraInRangeCount, possibleInterAllCount, interChrProb,
        baselineIntraChrProb)``.
    """
    with open(logfile, 'a') as log:
        if resolution:
            log.write("Looping through all possible fragment pairs in-range\n")
        else:
            log.write("Enumerating all possible fragment pairs in-range\n")
        log.write("------------------------------------------------------------------------------------\n")
    startT = time.time()

    minPossibleGenomicDist = float("inf")
    maxPossibleGenomicDist = 0
    possibleIntraAllCount = 0
    possibleInterAllCount = 0
    possibleIntraInRangeCount = 0

    allFragsDic = {}
    with gzip.open(fragsfile, 'rt') as infile:
        for line in infile:
            words = line.split()
            currChr = words[0]
            currMid = int(words[2])
            currHit = int(words[3])
            # The chromosome is registered even if none of its fragments pass.
            frags = allFragsDic.setdefault(currChr, [])
            if currHit >= mappThres:
                frags.append(currMid)

    noOfFrags = sum(len(frags) for frags in allFragsDic.values())

    if resolution:
        maxFrags = {}
        for ch, frags in allFragsDic.items():
            if not frags:
                # No mappable fragment on this chromosome: max() of an empty
                # list would raise, so mark it as contributing no distances.
                maxFrags[ch] = None
                continue
            maxFrags[ch] = max([int(i) - resolution / 2 for i in frags])
            maxPossibleGenomicDist = max(maxPossibleGenomicDist, maxFrags[ch])

        for ch in sorted(allFragsDic.keys()):
            maxFrag = maxFrags[ch]
            n = len(allFragsDic[ch])
            binTracker = 0
            possibleIntraInRangeCountPerChr = 0
            if maxFrag is None:
                distances = []
            else:
                distances = range(0, int(maxFrag + 1), resolution)
            for d, intxnDistance in enumerate(distances):
                if not myUtils.in_range_check(intxnDistance, distLowThres, distUpThres):
                    continue
                # n fragments give n - d pairs that are d bins apart
                npairs = n - d
                minPossibleGenomicDist = min(minPossibleGenomicDist, intxnDistance)
                possibleIntraInRangeCountPerChr += npairs
                binTracker = _find_bin(binStats, binTracker, intxnDistance)
                currBin = binStats[binTracker]
                currBin[7] += npairs
                currBin[1] += npairs
                currBin[3] += (float(intxnDistance / distScaling) * npairs)
            possibleInterAllCount += n * (noOfFrags - n)
            possibleIntraAllCount += (n * (n + 1)) / 2  # n(n-1) if excluding self
            with open(logfile, 'a') as log:
                log.write("Chromosome " + repr(ch) + ",\t" + str(n) + " mappable fragments, \t" + str(possibleIntraInRangeCountPerChr)
                          + " possible intra-chr fragment pairs in range,\t" + str((noOfFrags - n) * n) + " possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr

    else:
        for ch in sorted(allFragsDic.keys()):
            fragsPerChr = sorted(allFragsDic[ch])
            templen = len(fragsPerChr)
            possibleInterAllCount += (noOfFrags - templen) * templen
            possibleIntraInRangeCountPerChr = 0
            for first in range(templen):
                # the bin search restarts for every left-hand fragment
                binTracker = 0
                d = 0
                for second in range(first + 1, templen):
                    intxnDistance = abs(float(fragsPerChr[first]) - float(fragsPerChr[second]))
                    if not myUtils.in_range_check(intxnDistance, distLowThres, distUpThres):
                        continue
                    possibleIntraInRangeCountPerChr += 1
                    maxPossibleGenomicDist = max(maxPossibleGenomicDist, intxnDistance)
                    minPossibleGenomicDist = min(minPossibleGenomicDist, intxnDistance)
                    npairs = templen - d
                    d += 1
                    binTracker = _find_bin(binStats, binTracker, intxnDistance)
                    currBin = binStats[binTracker]
                    currBin[7] += npairs
                    currBin[1] += 1
                    currBin[3] += float(intxnDistance / distScaling) * npairs
                    possibleIntraAllCount += 1
            with open(logfile, 'a') as log:
                log.write("Chromosome " + repr(ch) + ",\t" + str(templen) + " mappable fragments, \t" + str(possibleIntraInRangeCountPerChr)
                          + " possible intra-chr fragment pairs in range,\t" + str((noOfFrags - templen) * templen) + " possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr

    # Every inter-chromosomal pair was counted once from each side.
    possibleInterAllCount /= 2
    # With a single chromosome there are no inter-chromosomal pairs.
    try:
        interChrProb = 1.0 / possibleInterAllCount
    except ZeroDivisionError:
        interChrProb = 0
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    endT = time.time()
    print("Fragments file read. Time took %s" % (endT - startT))

    with open(logfile, 'a') as log:
        log.write("Number of all fragments= %s\n" % (noOfFrags))
        log.write("Possible, Intra-chr in range: pairs= %s \n" % (possibleIntraInRangeCount))
        log.write("Possible, Intra-chr all: pairs= %s \n" % (possibleIntraAllCount))
        log.write("Possible, Inter-chr all: pairs= %s \n" % (possibleInterAllCount))
        log.write("Desired genomic distance range   [%d %s] \n" % (distLowThres, distUpThres))
        log.write("Range of possible genomic distances  [%d  %d] \n" % (minPossibleGenomicDist, maxPossibleGenomicDist))
        log.write("Baseline intrachromosomal probability is %s \n" % (baselineIntraChrProb))
        log.write("Interchromosomal probability is %s \n" % (interChrProb))

    return (binStats, noOfFrags, maxPossibleGenomicDist, possibleIntraInRangeCount, possibleInterAllCount, interChrProb, baselineIntraChrProb)  # return from generate_FragPairs
Example #11
0
def generate_FragPairs(infilename):
	"""Enumerate the possible fragment pairs described by a gzipped fragments file.

	Every line of the file is split on whitespace: field 1 is the chromosome
	name, field 2 is ignored, field 3 is the fragment midpoint and field 4 a
	hit count.  A fragment counts as mappable when its hit count is at least
	``mappabilityThreshold``.  A new chromosome is started whenever the name
	in field 1 differs from the previous line, so the lines of one chromosome
	are expected to be contiguous.

	Nothing is returned; the results are published through module globals:

	* ``chrList`` / ``listOfMappableFrags`` -- chromosome names and, in the
	  same order, the mappable midpoints of each chromosome.
	* ``possiblePairsPerDistance`` -- gains one ``[distance, 0, 1.0]`` entry
	  (count and bias placeholders) per intra-chr pair whose distance passes
	  ``myUtils.in_range_check``, keyed ``"<chr>-<smallerMid>-<largerMid>"``.
	* ``possibleIntraInRangeCount``, ``possibleIntraAllCount``,
	  ``possibleInterAllCount`` -- pair counts.
	* ``baselineIntraChrProb``, ``baselineInterChrProb`` -- reciprocals of the
	  two "all" counts; the inter-chr one is 0.0 when there is no inter-chr pair.

	Raises ZeroDivisionError when the file yields no intra-chr pair at all.
	"""
	sys.stderr.write("\nGenerating all possible intra-chromosomal fragment pairs and counting the number of all possible inter-chr fragment pairs\n")
	sys.stderr.write("------------------------------------------------------------------------------------\n")
	global listOfMappableFrags # two dimensional list with all mappable fragment midpoints for each chr
	global chrList # list of all chromosome names, in file order
	global possiblePairsPerDistance # all possible intra-chr fragment pairs
	global possibleInterAllCount # count of all possible inter-chr fragment pairs
	global possibleIntraAllCount # count of all possible intra-chr fragment pairs
	global possibleIntraInRangeCount # count of all possible intra-chr fragment pairs in the range we're interested
	global baselineInterChrProb # 1 divided by all possible inter-chr fragment pairs
	global baselineIntraChrProb #  1 divided by all possible intra-chr fragment pairs

	listOfMappableFrags=[]
	chrList=[]

	# read the fragments file in a single pass
	currChrNo=None # name of the chromosome being collected; None until the first line
	fragsPerChr=[] # temporary list that will be added to listOfMappableFrags for each chr
	totalNoOfFrags=0 # total number of all mappable fragments
	# NOTE(review): under Python 3, mode 'r' makes gzip yield bytes lines, so
	# chromosome names would be bytes -- confirm the intended interpreter.
	with gzip.open(infilename, 'r') as infile:
		for line in infile:
			words=line.rstrip().split()
			chrNo=words[0] # can be an integer or a string
			#words[1] ignored
			midPoint=int(words[2])
			hitCount=int(words[3])
			if currChrNo is None:
				currChrNo=chrNo
			elif currChrNo!=chrNo:
				# the chromosome name changed: store what was collected so far
				listOfMappableFrags.append(fragsPerChr)
				totalNoOfFrags += len(fragsPerChr)
				chrList.append(currChrNo)
				currChrNo = chrNo
				fragsPerChr=[]
			# add the mappable midPoints to the temp fragsPerChr
			if hitCount >= mappabilityThreshold:
				fragsPerChr.append(midPoint)
		#END for

	# handle the last chromosome (nothing to store for an empty file)
	if currChrNo is not None:
		listOfMappableFrags.append(fragsPerChr)
		totalNoOfFrags += len(fragsPerChr)
		chrList.append(currChrNo)

	# create all possible frag pairs
	possibleInterAllCount=0
	possibleIntraInRangeCount=0
	possibleIntraAllCount=0
	# Walk names and fragment lists in lockstep instead of looking the name up
	# with chrList.index(), which is linear and returns the first match when a
	# chromosome name occurs more than once.
	for chrName,chrFrags in zip(chrList,listOfMappableFrags):
		countIntraPairs=0
		tempLen=len(chrFrags)
		interPairsOfChr=(totalNoOfFrags-tempLen)*tempLen
		possibleInterAllCount+=interPairsOfChr
		# every unordered pair is visited exactly once by the loops below
		possibleIntraAllCount+=tempLen*(tempLen-1)//2
		# iterate over all possible intra-chr pairs to see which ones qualify as a 'possible' pair
		for x in range(tempLen):
			midX=chrFrags[x]
			for y in range(x+1,tempLen):
				midY=chrFrags[y]
				interactionDistance=abs(midX-midY)
				if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
					countIntraPairs +=1
					dictkey=str(chrName)+'-'+str(min(midX,midY))+'-'+str(max(midX,midY))
					possiblePairsPerDistance[dictkey]=[interactionDistance,0,1.0] # set count to zero for now and bias to 1.0
			#END for
		#END for
		possibleIntraInRangeCount+=countIntraPairs
		sys.stderr.write("Chromosome " +repr(chrName) +",\t"+str(tempLen) +" mappable fragments, \t"+str(countIntraPairs)\
		+" possible intra-chr fragment pairs in range,\t" + str(interPairsOfChr) +" possible inter-chr fragment pairs\n")
	#END for

	# divide the possibleInterAllCount by 2 so that every inter-chr interaction is counted only once
	# (each pair was counted from both of its chromosomes, so the sum is even;
	# floor division keeps the count an integer under true division as well)
	possibleInterAllCount=possibleInterAllCount//2
	sys.stderr.write("Total of \t"+str(possibleIntraInRangeCount) +" possible intra-chr fragment pairs in range,\t"\
	+str(possibleIntraAllCount) +" possible intra-chr fragment pairs,\t"\
	+str(possibleInterAllCount) +" possible inter-chr fragment pairs\n")
	# calculate inter-chr probabilities; always assign the global so that later
	# readers never see an undefined or stale value
	if possibleInterAllCount >0:
		baselineInterChrProb=1.0/possibleInterAllCount
	else:
		baselineInterChrProb=0.0
	baselineIntraChrProb=1.0/possibleIntraAllCount

	return # from generate_FragPairs
Example #12
0
def fit_Spline(x,y,yerr,infilename,sortedInteractions,biasDic,figname,passNo):
	"""Fit a non-increasing spline to binned contact probabilities and score interactions.

	Steps, in order:

	1. Fit a ``UnivariateSpline`` to ``(x, y)`` with smoothing ``min(y)**2``.
	2. Evaluate it at the distinct integer distances of ``sortedInteractions``
	   that lie inside ``[min(x), max(x)]`` and make the result non-increasing
	   with the R function ``monoreg`` (``type="antitonic"``).
	3. Read ``infilename`` and compute one p-value per interaction with
	   ``scsp.bdtrc``; then obtain q-values from
	   ``myStats.benjamini_hochberg_correction``.
	4. Write ``<outdir>/<figname>.significances.txt.gz``: in-range intra-chr
	   rows when ``useInters`` is false, inter-chr rows when it is true.
	5. Flag every entry of ``sortedInteractions`` whose p-value is below
	   ``1.0/possibleIntraInRangeCount`` as an outlier.
	6. When ``visual`` is true, save the spline and outlier plots; always call
	   ``plot_qvalues``.

	Parameters:
	x, y, yerr -- bin distances, bin probability means and their errors; the
	    spline is fitted on x and y, yerr is only used for error bars.
	infilename -- gzipped interactions file; whitespace-separated fields are
	    chr1, midpoint1, chr2, midpoint2, contact count.
	sortedInteractions -- iterable of (distance, contact count, bias) triples.
	biasDic -- mapping chr -> {midpoint: bias}; when empty all biases are 1.0.
	figname -- base name of every output file written under ``outdir``.
	passNo -- only used in plot labels.

	Returns the list ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``
	where ``residual`` is the sum of squared differences between ``y`` and the
	(unconstrained) spline at ``x``, and ``isOutlier`` holds 1/0 per entry of
	``sortedInteractions``.
	"""
	sys.stderr.write("\nFit a univariate spline to the probability means\n")
	sys.stderr.write("------------------------------------------------------------------------------------\n")
	sys.stderr.write("baseline intra-chr probability: " + repr(baselineIntraChrProb)+ "\tbaseline inter-chr probability: " + repr(baselineInterChrProb)+"\n")

	# assume residualFactor==-1:
	minY=min(y)
	splineError=minY*minY

	# use fitpack2 method -fit on the real x and y from equal occupancy binning
	ius = UnivariateSpline(x, y, s=splineError)

	#### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
	### This is done by calling the R function monoreg with type="antitonic",
	### so that the probabilities do not increase with genomic distance.

	# bounds of the binned distances; computed once and reused for every lookup below
	tempMaxX=max(x)
	tempMinX=min(x)
	tempList=sorted(set(int(i[0]) for i in sortedInteractions))
	### Keep nothing that is out of range of [min(x) max(x)]
	### Therefore everything will be within the range where the spline is defined
	splineX=[dist for dist in tempList if tempMinX<=dist and dist<=tempMaxX]
	splineY=ius(splineX)

	# R vector format
	rSplineX=ro.FloatVector(splineX)
	rSplineY=ro.FloatVector(splineY)
	rMonoReg=ro.r['monoreg']
	# do the antitonic regression
	allRres=rMonoReg(rSplineX,rSplineY,type="antitonic")
	rNewSplineY=allRres[3] # presumably the fitted values of monoreg -- confirm against the R package
	# convert data back to Python format
	newSplineY=[rNewSplineY[i] for i in range(len(rNewSplineY))]

	residual =sum([i*i for i in (y - ius(x))])

	if visual==True:
		### Now plot the results
		sys.stderr.write("Plotting %s" % figname + ".png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(2,1,1)
		plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
		plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)

		if useInters:
			plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineIntraChrProb for _ in x],toProb),'k-',label="Baseline intra-chromosomal")
			# the inter-chromosomal baseline has to use the inter-chr probability
			plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineInterChrProb for _ in x],toProb),'b-',label="Baseline inter-chromosomal")
		plt.ylabel('Contact probability (x10$^{-5}$)',fontsize='large')
		plt.xlabel('Genomic distance (kb)',fontsize='large')
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
		plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
		ax.legend(loc="upper right")

		ax = fig.add_subplot(2,1,2)

		plt.loglog(splineX,newSplineY,'g-')
		plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
		if useInters:
			plt.loglog(x,[baselineIntraChrProb for _ in x],'k-')
			plt.loglog(x,[baselineInterChrProb for _ in x],'b-')
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim([distLowThres, distUpThres])
		plt.ylabel('Contact probability (log-scale)',fontsize='large')
		plt.xlabel('Genomic distance (log-scale)',fontsize='large')

		plt.savefig(outdir+'/'+figname+'.png')

	# NOW calculate the pvalues; they are written to a file further below
	intraInRangeCount=0
	intraOutOfRangeCount=0
	intraVeryProximalCount=0
	interCount=0
	p_vals=[]
	q_vals=[]
	# For every line of the input: index of its p-value in p_vals, or None when
	# no p-value was computed (final branch below with useInters off). The
	# writing pass needs this to stay aligned with p_vals/q_vals.
	pvalIndexPerLine=[]
	# NOTE(review): under Python 3, mode 'r' makes gzip yield bytes lines -- confirm the intended interpreter.
	with gzip.open(infilename, 'r') as infile:
		sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " + repr(distUpThres) +"\n")
		for line in infile:
			words=line.rstrip().split()
			interxn=myUtils.Interaction([words[0], int(words[1]), words[2], int(words[3])])
			interxn.setCount(int(words[4]))
			chr1=words[0]
			chr2=words[2]
			midPoint1=int(words[1])
			midPoint2=int(words[3])

			bias1=1.0; bias2=1.0;  # assumes there is no bias to begin with
			# if the biasDic is not null sets the real bias values
			if len(biasDic)>0:
				if chr1 in biasDic and midPoint1 in biasDic[chr1]:
					bias1=biasDic[chr1][midPoint1]
				if chr2 in biasDic and midPoint2 in biasDic[chr2]:
					bias2=biasDic[chr2][midPoint2]

			noOfPvalsBefore=len(p_vals)
			if (bias1<0 or bias2<0) and interxn.type!='inter':
				# negative bias on an intra-chr pair: never significant
				p_val=1.0
				p_vals.append(p_val)
			elif interxn.getType(distLowThres,distUpThres)=='intraInRange':
				# make sure the interaction distance is covered by the probability bins
				distToLookUp=min(max(interxn.distance,tempMinX),tempMaxX)
				i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
				prior_p=newSplineY[i]*(bias1*bias2) # biases added in the picture
				intraInRangeCount +=1
				############# THIS HAS TO BE interactionCount-1 ##################
				p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraInRangeSum,prior_p)
				p_vals.append(p_val)

			elif interxn.getType(distLowThres,distUpThres)=='intraShort':
				p_val=1.0
				intraVeryProximalCount +=1
				p_vals.append(p_val)

			elif interxn.getType(distLowThres,distUpThres)=='intraLong':
				# out of range bigger than distUpThres
				# the baseline intra-chr prior is disabled here; a prior of 1.0 is used
				prior_p=1.0 #baselineIntraChrProb*(bias1*bias2)  # biases added in the picture
				p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraAllSum,prior_p)
				intraOutOfRangeCount +=1
				p_vals.append(p_val)

			else:
				if useInters:
					prior_p=baselineInterChrProb*(bias1*bias2) # biases added in the picture
					############# THIS HAS TO BE interactionCount-1 ##################
					p_val=scsp.bdtrc(interxn.hitCount-1,observedInterAllSum,prior_p)
					interCount +=1
					p_vals.append(p_val)

			if len(p_vals)>noOfPvalsBefore:
				pvalIndexPerLine.append(noOfPvalsBefore)
			else:
				pvalIndexPerLine.append(None)
		# END for

	# Do the BH FDR correction
	if useInters:
		q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraAllCount)
		sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " + repr(possibleInterAllCount+possibleIntraAllCount)+"\n")
	else:
		q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
		sys.stderr.write("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount)+"\n")

	# NOW write the calculated pvalues and corrected pvalues in a file
	# NOTE(review): under Python 3, a gzip file opened with mode 'w' expects bytes, not str -- confirm the intended interpreter.
	with gzip.open(infilename, 'r') as infile:
		with gzip.open(outdir+'/'+figname+'.significances.txt.gz', 'w') as outfile:
			sys.stderr.write("Writing p-values to file %s" % figname + ".significances.txt.gz\n")
			outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")

			for lineNo,line in enumerate(infile):
				pvalIndex=pvalIndexPerLine[lineNo]
				if pvalIndex is None:
					# no p-value was computed for this line, so there is nothing to report
					continue
				words=line.rstrip().split()
				chrNo1=words[0]
				midPoint1=int(words[1])
				chrNo2=words[2]
				midPoint2=int(words[3])
				interactionCount=int(words[4])
				p_val=p_vals[pvalIndex]
				q_val=q_vals[pvalIndex]

				if useInters==False and chrNo1==chrNo2: # intra
					interactionDistance=abs(midPoint1-midPoint2) # dist
					if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
						outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
				elif useInters==True and chrNo1!=chrNo2:
					outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
			# END for - printing pvals and qvals for all the interactions

	isOutlier=[]
	distsBelow=[]
	distsAbove=[]
	intcountsBelow=[]
	intcountsAbove=[]
	outlierThres=1.0/possibleIntraInRangeCount
	for interactionDistance,interactionCount,bias12 in sortedInteractions:
		# make sure the interaction distance is covered by the probability bins
		distToLookUp=min(max(interactionDistance,tempMinX),tempMaxX)
		i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
		prior_p=newSplineY[i]*float(bias12) # biases added in the picture
		############# THIS HAS TO BE interactionCount-1 ##################
		p_val=scsp.bdtrc(interactionCount-1,observedIntraInRangeSum,prior_p)
		if p_val < outlierThres:
			distsBelow.append(interactionDistance)
			intcountsBelow.append(interactionCount)
			isOutlier.append(1)
		else:
			distsAbove.append(interactionDistance)
			intcountsAbove.append(interactionCount)
			isOutlier.append(0)
	# END for - doing the outlier check for all interactions in sortedInteractions


	if visual==True:
		sys.stderr.write("Plotting results of extracting outliers to file %s" % figname + ".extractOutliers.png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(111)
		# sample sizes use floor division: random.sample needs an integer count
		downsample=30 # for the non-outliers
		# not plotted at the moment; still drawn so the random draws that follow are unaffected
		randIndcsAbove=sample(list(range(len(intcountsAbove))),len(intcountsAbove)//downsample)
		randIndcsAbove=sorted(randIndcsAbove)
		downsample=20 # for the outliers
		randIndcsBelow=sample(list(range(len(intcountsBelow))),len(intcountsBelow)//downsample)
		randIndcsBelow=sorted(randIndcsBelow)

		plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],toKb),[intcountsBelow[i] for i in randIndcsBelow], 'r.',label="Outliers (p-value < 1/M)")
		# expected counts (spline x N), extended flat to the largest observed distance
		plt.plot(myUtils.scale_a_list(splineX+[maxObservedGenomicDist],toKb),[prob*observedIntraInRangeSum for prob in newSplineY]+[newSplineY[-1]*observedIntraInRangeSum], 'g-', label="spline-"+str(passNo)+" (x N)", linewidth=2.5)

		plt.xlabel('Genomic distance (kb)')
		plt.ylabel('Contact counts')
		print(repr(len(intcountsBelow))+"\t"),
		## this limits y-axis of the hit count plots
		if len(intcountsBelow)>0:
			plt.ylim([0,min(max(intcountsBelow),1500)])
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim([0, distUpThres*toKb])
		ax.legend(loc="upper right",fancybox=True)
		plt.savefig(outdir+'/'+figname+'.extractOutliers.png')

	sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount)+"\tintraOutOfRangeCount " +\
		repr(intraOutOfRangeCount)+"\tintraVeryProximalCount " + repr(intraVeryProximalCount) +"\tinterCount " + repr(interCount)+"\n")

	if visual==True:
		sys.stderr.write("Plotting q-values to file %s" % figname + ".qplot.png\n")
	minFDR=0.0
	maxFDR=0.05
	increment=0.001
	FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,figname+".qplot")

	return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy] # from fit_Spline