def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit a non-increasing spline to the binned probabilities and score contacts.

    Unless the module-level flag ``interOnly`` is set, a ``UnivariateSpline``
    is fit to (x, y), evaluated at every key of ``mainDic`` that lies within
    [min(x), max(x)], and made non-increasing with ``IsotonicRegression``.
    Every line of ``infilename`` then receives a p-value, the p-values are
    corrected with ``myStats.benjamini_hochberg_correction`` and the results
    are written to a gzip-compressed text file.

    Parameters:
        mainDic: mapping whose keys are the distances at which the spline is
            evaluated (only the keys are read here).
        x, y, yerr: per-bin average distance, mean probability and standard
            error. ``x`` is sorted in place (and ``y`` reordered to match)
            when ``outliersdist`` is not None.
        infilename: gzip text file with five whitespace-separated columns
            (chr1, mid1, chr2, mid2, contactCount).
        outfilename: path prefix for the ``.png`` and ``.significances.txt.gz``
            outputs.
        biasDic: ``biasDic[chr][mid]`` -> bias; loci that are missing keep a
            bias of 1.0.
        outliersline, outliersdist: collections supporting ``.add``; row
            indices and distances of rows whose p-value is below the outlier
            threshold are added to them.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to
            ``scsp.bdtrc``.
        possibleIntraInRangeCount, possibleInterAllCount: pair counts used for
            the multiple-testing correction and the outlier threshold.
        observedIntraAllSum: accepted for interface compatibility; not used.
        resolution: when truthy, included in the output file name.
        passNo: pass number shown in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``. The first three are None when ``interOnly`` is set; the last
        two are None unless ``visual == True``.

    Reads the module-level names ``logfile``, ``interOnly``, ``allReg``,
    ``visual``, ``distLowThres``, ``distUpThres``, ``toKb``, ``toProb`` and
    ``interChrProb``. Calls ``sys.exit(2)`` when ``x`` is not strictly
    increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    # Bounds of the fitted range; only defined (and only used) for intra fits.
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                # The spline needs strictly increasing abscissae.
                print(
                    "ERROR in spline fitting. Distances do not strictly increase across bins. Ensure interaction file is correct."
                )
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)

        # use fitpack2 method - fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)

        tempMaxX = max(x)
        tempMinX = min(x)
        # Keep only distances inside [min(x), max(x)], where the spline is defined.
        splineX = [dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX]
        splineY = ius(splineX)

        # Force the fitted probabilities to be non-increasing with distance.
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([r * r for r in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-',
                     label="spline-" + str(passNo),
                     linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.',
                         label="Mean with std. error",
                         linewidth=2)
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(
                    myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)
            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')
            plt.savefig(outfilename + '.png')

    # First pass over the contacts: one p-value (and bias pair) per input line.
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres, distUpThres)

            # Assume no bias unless biasDic provides a value for the locus.
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # Negative bias: the intra-chromosomal contact is discarded.
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # Clamp the distance into the range covered by the spline.
                distToLookUp = min(max(interxn.getDistance(), tempMinX),
                                   tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif interactionType in ('intraShort',
                                     'intraLong') and not interOnly:
                # Out-of-range intra contacts are never called significant.
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        outlierThres = 1.0 / (possibleIntraInRangeCount +
                              possibleInterAllCount)
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraInRangeCount)
    elif interOnly:
        outlierThres = 1.0 / possibleInterAllCount
        q_vals = myStats.benjamini_hochberg_correction(p_vals,
                                                       possibleInterAllCount)
    else:
        outlierThres = 1.0 / possibleIntraInRangeCount
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: write the values back out, row-aligned with the input.
    if resolution:
        outpath = outfilename + '.res' + str(
            resolution) + '.significances.txt.gz'
    else:
        outpath = outfilename + '.significances.txt.gz'
    rowFormat = "%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(outpath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" %
              (outfilename + ".significances.txt"))
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]
            bias1 = biasl[count]
            bias2 = biasr[count]

            if (allReg or interOnly) and chr1 != chr2:
                outfile.write(rowFormat %
                              (str(chr1), midPoint1, str(chr2), midPoint2,
                               interactionCount, p_val, q_val, bias1, bias2))
            if (allReg or not interOnly) and chr1 == chr2:
                interactionDistance = abs(midPoint1 - midPoint2)
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    outfile.write(
                        rowFormat %
                        (str(chr1), midPoint1, str(chr2), midPoint2,
                         interactionCount, p_val, q_val, bias1, bias2))

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1 - midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                                  outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [
        splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy
    ]  # from fit_Spline
def generate_FragPairs(infilename):
    """Enumerate possible fragment pairs from a gzip-compressed fragments file.

    Each line of ``infilename`` is split on whitespace; column 0 is the
    chromosome name, column 2 the fragment midpoint and column 3 the hit
    count (column 1 is ignored). A fragment is kept when its hit count is at
    least the module-level ``mappabilityThreshold``. Consecutive lines with
    the same chromosome name form one chromosome run.

    Results are stored in module-level globals rather than returned:
    ``listOfMappableFrags``, ``chrList``, ``possibleInterAllCount``,
    ``possibleIntraAllCount``, ``possibleIntraInRangeCount``,
    ``baselineInterChrProb`` (only when there is at least one inter-chr pair)
    and ``baselineIntraChrProb``. The existing ``possiblePairsPerDistance``
    mapping gains one entry ``[distance, 0, 1.0]`` per in-range pair, keyed
    ``"<chr>-<smaller mid>-<larger mid>"``.

    Raises:
        IndexError: if the file is empty or contains a blank line.
        ZeroDivisionError: if no intra-chromosomal pair exists at all.
    """
    sys.stderr.write(
        "\nGenerating all possible intra-chromosomal fragment pairs and counting the number of all possible inter-chr fragment pairs\n"
    )
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )

    global listOfMappableFrags  # two dimensional list with all mappable fragment midpoints for each chr
    global chrList  # list of all chromosomes, in file order
    global possiblePairsPerDistance  # all possible intra-chr fragment pairs
    global possibleInterAllCount  # count of all possible inter-chr fragment pairs
    global possibleIntraAllCount  # count of all possible intra-chr fragment pairs
    global possibleIntraInRangeCount  # count of possible intra-chr pairs within the distance range
    global baselineInterChrProb  # 1 divided by all possible inter-chr fragment pairs
    global baselineIntraChrProb  # 1 divided by all possible intra-chr fragment pairs

    listOfMappableFrags = []
    chrList = []

    # Text mode ('rt') so chromosome names are str, not bytes, on Python 3;
    # bytes would leak into the dictionary keys as "b'chr1'-...".
    with gzip.open(infilename, 'rt') as infile:
        currChrNo = infile.readline().rstrip().split()[0]

    fragsPerChr = []  # mappable midpoints of the chromosome being read
    totalNoOfFrags = 0  # total number of all mappable fragments
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            chrNo = words[0]  # can be an integer or a string
            midPoint = int(words[2])
            hitCount = int(words[3])
            # whenever the name of the chromosome changes, flush the run
            if currChrNo != chrNo:
                listOfMappableFrags.append(fragsPerChr)
                totalNoOfFrags += len(fragsPerChr)
                chrList.append(currChrNo)
                currChrNo = chrNo
                fragsPerChr = []
            if hitCount >= mappabilityThreshold:
                fragsPerChr.append(midPoint)
    # handle the last chromosome
    listOfMappableFrags.append(fragsPerChr)
    totalNoOfFrags += len(fragsPerChr)
    chrList.append(currChrNo)

    possibleInterAllCount = 0
    possibleIntraInRangeCount = 0
    possibleIntraAllCount = 0
    # Pair each name with its own fragment list by position. Looking the name
    # up with chrList.index() would return the first run when a chromosome
    # name occurs in more than one non-contiguous run.
    for i, fragsPerChr in zip(chrList, listOfMappableFrags):
        countIntraPairs = 0
        tempLen = len(fragsPerChr)
        interPairsForChr = (totalNoOfFrags - tempLen) * tempLen
        possibleInterAllCount += interPairsForChr
        # iterate over all intra-chr pairs to see which ones are in range
        for x in range(tempLen):
            for y in range(x + 1, tempLen):
                interactionDistance = abs(fragsPerChr[x] - fragsPerChr[y])
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    countIntraPairs += 1
                    dictkey = str(i) + '-' + str(
                        min(fragsPerChr[x], fragsPerChr[y])) + '-' + str(
                            max(fragsPerChr[x], fragsPerChr[y]))
                    # set count to zero for now and bias to 1.0
                    possiblePairsPerDistance[dictkey] = [
                        interactionDistance, 0, 1.0
                    ]
                possibleIntraAllCount += 1
        possibleIntraInRangeCount += countIntraPairs
        sys.stderr.write("Chromosome " + repr(i) + ",\t" + str(tempLen) +
                         " mappable fragments, \t" + str(countIntraPairs) +
                         " possible intra-chr fragment pairs in range,\t" +
                         str(interPairsForChr) +
                         " possible inter-chr fragment pairs\n")

    # Every inter-chr pair was counted once from each side; the sum is always
    # even, so floor division keeps the count an exact integer.
    possibleInterAllCount = possibleInterAllCount // 2
    sys.stderr.write("Total of \t" + str(possibleIntraInRangeCount) +
                     " possible intra-chr fragment pairs in range,\t" +
                     str(possibleIntraAllCount) +
                     " possible intra-chr fragment pairs,\t" +
                     str(possibleInterAllCount) +
                     " possible inter-chr fragment pairs\n")

    # calculate the baseline probabilities
    if possibleInterAllCount > 0:
        baselineInterChrProb = 1.0 / possibleInterAllCount
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    return  # from generate_FragPairs
def _advance_to_bin(binStats, binTracker, intxnDistance):
    """Move binTracker forward until its bin's [min, max] covers intxnDistance.

    Stays on the last existing bin when no later bin covers the distance.
    Returns the (possibly advanced) tracker and the bin it points at.
    """
    currBin = binStats[binTracker]
    while not (currBin[0][0] <= intxnDistance <= currBin[0][1]):
        if binTracker + 1 not in binStats:
            break
        binTracker += 1
        currBin = binStats[binTracker]
    return binTracker, currBin


def generate_FragPairs(binStats, fragsfile, resolution):
    """Count possible fragment pairs and accumulate them into ``binStats``.

    ``fragsfile`` is a gzip text file; per line, column 0 is the chromosome,
    column 2 the fragment midpoint and column 3 a hit count. Fragments whose
    hit count is below the module-level ``mappThres`` are ignored, but their
    chromosome is still registered.

    When ``resolution`` is truthy, distances are stepped in multiples of
    ``resolution`` per chromosome and ``n - d`` pairs are attributed to the
    d-th step. Otherwise every pair of mappable fragments on a chromosome is
    enumerated explicitly. For each in-range distance the covering bin is
    located and its entries 7, 1 and 3 are incremented in place.

    Parameters:
        binStats: mapping from bin index (0, 1, ...) to a mutable record
            whose element 0 is a ``(minDistance, maxDistance)`` pair.
        fragsfile: path of the gzip-compressed fragments file.
        resolution: fixed bin size, or a falsy value for arbitrary fragments.

    Returns:
        ``(binStats, noOfFrags, maxPossibleGenomicDist,
        possibleIntraInRangeCount, possibleInterAllCount, interChrProb,
        baselineIntraChrProb)``. ``interChrProb`` is 0 when there is no
        inter-chromosomal pair.

    Raises:
        ZeroDivisionError: if no intra-chromosomal pair was counted.

    Reads the module-level names ``logfile``, ``mappThres``, ``distLowThres``,
    ``distUpThres`` and ``distScaling``; appends progress to ``logfile``.
    """
    with open(logfile, 'a') as log:
        if resolution:
            log.write("Looping through all possible fragment pairs in-range\n")
        else:
            log.write("Enumerating all possible fragment pairs in-range\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    startT = time.time()

    minPossibleGenomicDist = float("inf")
    maxPossibleGenomicDist = 0
    possibleIntraAllCount = 0
    possibleInterAllCount = 0
    possibleIntraInRangeCount = 0

    allFragsDic = {}
    with gzip.open(fragsfile, 'rt') as infile:
        for line in infile:
            words = line.split()
            currChr = words[0]
            currMid = int(words[2])
            currHit = int(words[3])
            if currChr not in allFragsDic:
                allFragsDic[currChr] = []
            if currHit >= mappThres:
                allFragsDic[currChr].append(currMid)

    noOfFrags = 0
    if resolution:
        maxFrags = {}
        for ch in allFragsDic:
            # default=-1 makes a chromosome without any mappable fragment
            # yield an empty distance range below instead of raising in max().
            maxFrags[ch] = max(
                (int(i) - resolution / 2 for i in allFragsDic[ch]),
                default=-1)
            noOfFrags += len(allFragsDic[ch])
            maxPossibleGenomicDist = max(maxPossibleGenomicDist, maxFrags[ch])

        for ch in sorted(allFragsDic.keys()):
            maxFrag = maxFrags[ch]
            n = len(allFragsDic[ch])
            d = 0
            binTracker = 0
            possibleIntraInRangeCountPerChr = 0
            for intxnDistance in range(0, int(maxFrag + 1), resolution):
                npairs = n - d
                d += 1
                if not myUtils.in_range_check(intxnDistance, distLowThres,
                                              distUpThres):
                    continue
                minPossibleGenomicDist = min(minPossibleGenomicDist,
                                             intxnDistance)
                possibleIntraInRangeCountPerChr += npairs
                binTracker, currBin = _advance_to_bin(binStats, binTracker,
                                                      intxnDistance)
                currBin[7] += npairs
                currBin[1] += npairs
                currBin[3] += (float(intxnDistance / distScaling) * npairs)
            possibleInterAllCount += n * (noOfFrags - n)
            possibleIntraAllCount += (n * (n + 1)) / 2  # n(n-1) if excluding self
            with open(logfile, 'a') as log:
                log.write("Chromosome " + repr(ch) + ",\t" + str(n) +
                          " mappable fragments, \t" +
                          str(possibleIntraInRangeCountPerChr) +
                          " possible intra-chr fragment pairs in range,\t" +
                          str((noOfFrags - n) * n) +
                          " possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr
    else:
        for ch in allFragsDic:
            noOfFrags += len(allFragsDic[ch])

        for ch in sorted(allFragsDic.keys()):
            fragsPerChr = sorted(allFragsDic[ch])
            templen = len(fragsPerChr)
            possibleInterAllCount += (noOfFrags - templen) * templen
            possibleIntraInRangeCountPerChr = 0
            for x in range(templen):
                # Fragments are sorted, so distances grow with y and the bin
                # search can restart from the first bin for every x.
                binTracker = 0
                d = 0
                for y in range(x + 1, templen):
                    intxnDistance = abs(
                        float(fragsPerChr[x]) - float(fragsPerChr[y]))
                    if not myUtils.in_range_check(intxnDistance, distLowThres,
                                                  distUpThres):
                        continue
                    possibleIntraInRangeCountPerChr += 1
                    maxPossibleGenomicDist = max(maxPossibleGenomicDist,
                                                 intxnDistance)
                    minPossibleGenomicDist = min(minPossibleGenomicDist,
                                                 intxnDistance)
                    npairs = templen - d
                    d += 1
                    binTracker, currBin = _advance_to_bin(
                        binStats, binTracker, intxnDistance)
                    # NOTE(review): entries 7 and 3 are weighted by npairs
                    # while entry 1 counts a single pair - kept as found;
                    # confirm this asymmetry is intended.
                    currBin[7] += npairs
                    currBin[1] += 1
                    currBin[3] += float(intxnDistance / distScaling) * npairs
                    possibleIntraAllCount += 1
            with open(logfile, 'a') as log:
                log.write("Chromosome " + repr(ch) + ",\t" + str(templen) +
                          " mappable fragments, \t" +
                          str(possibleIntraInRangeCountPerChr) +
                          " possible intra-chr fragment pairs in range,\t" +
                          str((noOfFrags - templen) * templen) +
                          " possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr

    # Each inter-chr pair was counted once from either chromosome.
    possibleInterAllCount /= 2
    try:
        interChrProb = 1.0 / possibleInterAllCount
    except ZeroDivisionError:
        # No inter-chromosomal pair exists (e.g. a single chromosome).
        interChrProb = 0
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    endT = time.time()
    print("Fragments file read. Time took %s" % (endT - startT))

    with open(logfile, 'a') as log:
        log.write("Number of all fragments= %s\n" % (noOfFrags))
        log.write("Possible, Intra-chr in range: pairs= %s \n" %
                  (possibleIntraInRangeCount))
        log.write("Possible, Intra-chr all: pairs= %s \n" %
                  (possibleIntraAllCount))
        log.write("Possible, Inter-chr all: pairs= %s \n" %
                  (possibleInterAllCount))
        log.write("Desired genomic distance range   [%d %s] \n" %
                  (distLowThres, distUpThres))
        log.write("Range of possible genomic distances  [%d  %d] \n" %
                  (minPossibleGenomicDist, maxPossibleGenomicDist))
        log.write("Baseline intrachromosomal probability is %s \n" %
                  (baselineIntraChrProb))
        log.write("Interchromosomal probability is %s \n" % (interChrProb))

    return (binStats, noOfFrags, maxPossibleGenomicDist,
            possibleIntraInRangeCount, possibleInterAllCount, interChrProb,
            baselineIntraChrProb)  # return from generate_FragPairs
def generate_FragPairs(mainDic,infilename): # lowMappThres
    """Count possible fixed-size fragment pairs per genomic distance.

    Reads the gzip-compressed ``infilename`` (column 0: chromosome, column 1:
    fragment midpoint) and, for every multiple of the module-level
    ``resolution`` up to the largest possible distance, stores the number of
    possible intra-chromosomal pairs in ``mainDic[distance][0]``
    (``mainDic[distance]`` is initialised to ``[0, 0]``).

    Updates the module-level globals ``maxPossibleGenomicDist``,
    ``possibleIntraAllCount``, ``possibleInterAllCount``,
    ``possibleIntraInRangeCount``, ``interChrProb`` and
    ``baselineIntraChrProb``; the counters are accumulated onto their values
    at call time.

    Returns:
        ``(mainDic, noOfFrags)`` where ``noOfFrags`` is the number of distinct
        (chromosome, midpoint) entries read.

    Raises:
        ZeroDivisionError: if ``possibleIntraAllCount`` ends up as zero.
    """
    print("\nEnumerating all possible intra-chromosomal fragment pairs in-range\n")
    print("------------------------------------------------------------------------------------\n")
    global maxPossibleGenomicDist
    global possibleIntraAllCount
    global possibleInterAllCount
    global possibleIntraInRangeCount
    global interChrProb
    global baselineIntraChrProb

    allFragsDic={}
    with gzip.open(infilename,'r') as infile:
        for indx,line in enumerate(infile):
            words=line.split()
            currChr=words[0]
            currMid=words[1]
            if currChr not in allFragsDic:
                allFragsDic[currChr]={}
            allFragsDic[currChr][currMid]=indx

    noOfFrags=0
    maxFrags={}
    for ch in allFragsDic:
        # Floor division keeps the distances integral so they can be fed to
        # range() below; true division yields a float on Python 3.
        maxFrags[ch]=max([int(i)-resolution//2 for i in allFragsDic[ch]])
        noOfFrags+=len(allFragsDic[ch])
        maxPossibleGenomicDist=max(maxPossibleGenomicDist,maxFrags[ch])

    for i in range(0,maxPossibleGenomicDist+1,resolution):
        mainDic[i]=[0,0]

    for ch in allFragsDic:
        maxFrag=maxFrags[ch]
        n=len(allFragsDic[ch])
        # The d-th distance step can be realised by n-d fragment pairs.
        for d,i in enumerate(range(0,maxFrag+1,resolution)):
            mainDic[i][0]+=n-d
        possibleInterAllCount+=n*(noOfFrags-n)
        # n*(n+1) is always even, so floor division is exact.
        possibleIntraAllCount+=(n*(n+1))//2 # n(n-1) if excluding self
    # Every inter-chr pair was counted once from each chromosome.
    possibleInterAllCount//=2
    # A single-chromosome input has no inter-chr pair; report probability 0
    # instead of failing with ZeroDivisionError.
    interChrProb=1.0/possibleInterAllCount if possibleInterAllCount else 0
    baselineIntraChrProb=1.0/possibleIntraAllCount

    for i in range(0,maxPossibleGenomicDist+1,resolution):
        if myUtils.in_range_check(i,distLowThres,distUpThres):
            possibleIntraInRangeCount+=mainDic[i][0]

    print("Number of all fragments= "+str(noOfFrags)+"\t resolution= "+ str(resolution))
    print("Possible, Intra-chr in range: pairs= "+str(possibleIntraInRangeCount))
    print("Possible, Intra-chr all: pairs= "+str(possibleIntraAllCount))
    print("Possible, Inter-chr all: pairs= "+str(possibleInterAllCount))
    print("Desired genomic distance range	[%d %d]" % (distLowThres,distUpThres) + "\n")
    print("Range of possible genomic distances	[0	%d]" % (maxPossibleGenomicDist) + "\n")

    return (mainDic,noOfFrags) # return from generate_FragPairs
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname,
               passNo):
    """Fit an antitonic spline to the binned probabilities and score contacts.

    A ``UnivariateSpline`` is fit to (x, y), evaluated at every distinct
    integer distance of ``sortedInteractions`` that lies in [min(x), max(x)],
    and made non-increasing by calling the R function ``monoreg`` through
    ``ro``. Each line of ``infilename`` then gets a p-value, the p-values are
    corrected with ``myStats.benjamini_hochberg_correction`` and written to
    ``<outdir>/<figname>.significances.txt.gz``. Finally every entry of
    ``sortedInteractions`` is flagged as outlier or not.

    Parameters:
        x, y, yerr: per-bin average distance, mean probability, std. error.
        infilename: gzip text file with five whitespace-separated columns
            (chr1, mid1, chr2, mid2, contactCount).
        sortedInteractions: iterable of ``(distance, count, bias)`` triples.
        biasDic: ``biasDic[chr][mid]`` -> bias; missing loci use 1.0.
        figname: base name of the output files inside ``outdir``.
        passNo: pass number shown in plot legends.

    Returns:
        ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]`` where
        ``isOutlier`` holds 1/0 per entry of ``sortedInteractions`` and
        ``FDRx``/``FDRy`` are None unless ``visual == True``.

    Reads the module-level names ``baselineIntraChrProb``,
    ``baselineInterChrProb``, ``visual``, ``useInters``, ``distLowThres``,
    ``distUpThres``, ``toKb``, ``toProb``, ``outdir``,
    ``observedIntraInRangeSum``, ``observedIntraAllSum``,
    ``observedInterAllSum``, ``possibleInterAllCount``,
    ``possibleIntraAllCount``, ``possibleIntraInRangeCount`` and
    ``maxObservedGenomicDist``.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )
    sys.stderr.write("baseline intra-chr probability: " +
                     repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " +
                     repr(baselineInterChrProb) + "\n")

    # Defined up front so the return statement works when visual is off.
    FDRx = None
    FDRy = None

    # assume residualFactor==-1: allowed residual is min(y)^2
    splineError = min(y) * min(y)

    # use fitpack2 method - fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    tempMaxX = max(x)
    tempMinX = min(x)
    tempList = sorted(list(set([int(i[0]) for i in sortedInteractions])))
    # Keep only distances inside [min(x), max(x)], where the spline is defined.
    splineX = [dist for dist in tempList if tempMinX <= dist <= tempMaxX]
    splineY = ius(splineX)

    # Antitonic (non-increasing) regression via the R function monoreg.
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([r * r for r in (y - ius(x))])

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb),
                 myUtils.scale_a_list(newSplineY, toProb),
                 'g-',
                 label="spline-" + str(passNo),
                 linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb),
                     fmt='r.',
                     label="Mean with std. error",
                     linewidth=2)
        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for i in x],
                                          toProb),
                     'k-',
                     label="Baseline intra-chromosomal")
            # The inter-chromosomal baseline must use the inter probability.
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineInterChrProb for i in x],
                                          toProb),
                     'b-',
                     label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)
        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for i in x], 'k-')
            plt.loglog(x, [baselineInterChrProb for i in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')
        plt.savefig(outdir + '/' + figname + '.png')

    # First pass: exactly one p-value per input line, in file order.
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " +
                     repr(distUpThres) + "\n")
    p_vals = []
    # Text mode so chromosome names are str and compare against biasDic keys.
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            interxn = myUtils.Interaction(
                [words[0], int(words[1]), words[2],
                 int(words[3])])
            interxn.setCount(int(words[4]))
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])

            # Assume no bias unless biasDic provides a value for the locus.
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
                p_val = 1.0
            elif interxn.getType(distLowThres, distUpThres) == 'intraInRange':
                # make sure the distance is covered by the probability bins
                distToLookUp = min(max(interxn.distance, tempMinX), tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                intraInRangeCount += 1
                # the survival function is evaluated at count - 1
                p_val = scsp.bdtrc(interxn.hitCount - 1,
                                   observedIntraInRangeSum, prior_p)
            elif interxn.getType(distLowThres, distUpThres) == 'intraShort':
                p_val = 1.0
                intraVeryProximalCount += 1
            elif interxn.getType(distLowThres, distUpThres) == 'intraLong':
                # out of range, bigger than distUpThres
                prior_p = 1.0
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum,
                                   prior_p)
                intraOutOfRangeCount += 1
            elif useInters:
                prior_p = baselineInterChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum,
                                   prior_p)
                interCount += 1
            else:
                # Inter-chromosomal contact while inters are not modelled:
                # give it a neutral p-value so p_vals stays aligned with the
                # input lines, which the writer below indexes by line number.
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) +
                         "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " +
                         repr(possibleIntraInRangeCount) + "\n")

    # Second pass: write the rows that belong to the analysed contact class.
    rowFormat = "%s\t%d\t%s\t%d\t%d\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(outdir + '/' + figname + '.significances.txt.gz',
                      'wt') as outfile:
        sys.stderr.write("Writing p-values to file %s" % figname +
                         ".significances.txt.gz\n")
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if useInters == False and chrNo1 == chrNo2:  # intra
                interactionDistance = abs(midPoint1 - midPoint2)
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    outfile.write(rowFormat %
                                  (str(chrNo1), midPoint1, str(chrNo2),
                                   midPoint2, interactionCount, p_val, q_val))
            elif useInters == True and chrNo1 != chrNo2:
                outfile.write(rowFormat %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))

    # Outlier check for all interactions in sortedInteractions.
    isOutlier = []
    distsBelow = []
    intcountsBelow = []
    intcountsAbove = []
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the distance is covered by the probability bins
        distToLookUp = min(max(interactionDistance, tempMinX), tempMaxX)
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum,
                           prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" %
                         figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # sample() needs an integer size, hence floor division.
        downsample = 30  # for the non-outliers
        # Drawn but not plotted; kept so the random stream is consumed as before.
        randIndcsAbove = sorted(
            sample(list(range(len(intcountsAbove))),
                   len(intcountsAbove) // downsample))
        downsample = 20  # for the outliers
        randIndcsBelow = sorted(
            sample(list(range(len(intcountsBelow))),
                   len(intcountsBelow) // downsample))

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],
                                      toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.',
                 label="Outliers (p-value < 1/M)")
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist],
                                      toKb),
                 [
                     newSplineY[i] * observedIntraInRangeSum
                     for i in range(len(newSplineY))
                 ] + [newSplineY[-1] * observedIntraInRangeSum],
                 'g-',
                 label="spline-" + str(passNo) + " (x N)",
                 linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        print(repr(len(intcountsBelow)) + "\t")
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount) +
                     "\tintraOutOfRangeCount " + repr(intraOutOfRangeCount) +
                     "\tintraVeryProximalCount " +
                     repr(intraVeryProximalCount) + "\tinterCount " +
                     repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname +
                         ".qplot.png\n")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                                  figname + ".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx,
            FDRy]  # from fit_Spline
def generate_FragPairs(mainDic, infilename):  # lowMappThres
    """Tabulate how many fragment pairs are possible at each genomic distance.

    The gzip-compressed ``infilename`` is read line by line (column 0 is the
    chromosome, column 1 the fragment midpoint). ``mainDic`` receives an
    entry ``[0, 0]`` for every multiple of the module-level ``resolution`` up
    to the largest possible distance, and element 0 of each entry is
    increased by the number of intra-chromosomal pairs at that distance.

    Side effects: updates the module-level globals
    ``maxPossibleGenomicDist``, ``possibleIntraAllCount``,
    ``possibleInterAllCount``, ``possibleIntraInRangeCount``,
    ``interChrProb`` and ``baselineIntraChrProb``. The counters are added to
    whatever value they hold when the function is called.

    Returns:
        ``(mainDic, noOfFrags)``; ``noOfFrags`` counts the distinct
        (chromosome, midpoint) entries that were read.

    Raises:
        ZeroDivisionError: if ``possibleIntraAllCount`` is zero afterwards.
    """
    print(
        "\nEnumerating all possible intra-chromosomal fragment pairs in-range\n"
    )
    print(
        "------------------------------------------------------------------------------------\n"
    )
    global maxPossibleGenomicDist
    global possibleIntraAllCount
    global possibleInterAllCount
    global possibleIntraInRangeCount
    global interChrProb
    global baselineIntraChrProb

    # chromosome -> {midpoint token: line index}
    allFragsDic = {}
    with gzip.open(infilename, 'r') as infile:
        indx = 0
        for line in infile:
            words = line.split()
            allFragsDic.setdefault(words[0], {})[words[1]] = indx
            indx += 1

    halfResolution = resolution // 2  # integral, so range() accepts the sums
    noOfFrags = 0
    maxFrags = {}
    for ch, fragsOfChr in allFragsDic.items():
        maxFrags[ch] = max([int(i) - halfResolution for i in fragsOfChr])
        noOfFrags += len(fragsOfChr)
        maxPossibleGenomicDist = max(maxPossibleGenomicDist, maxFrags[ch])

    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        mainDic[i] = [0, 0]

    for ch, fragsOfChr in allFragsDic.items():
        n = len(fragsOfChr)
        d = 0
        for i in range(0, maxFrags[ch] + 1, resolution):
            # n - d fragment pairs are separated by d resolution steps
            mainDic[i][0] += n - d
            d += 1
        possibleInterAllCount += n * (noOfFrags - n)
        # n * (n + 1) is even, so the floor division loses nothing
        possibleIntraAllCount += (n * (n + 1)) // 2  # n(n-1) if excluding self

    # Each inter-chr pair was counted from both of its chromosomes.
    possibleInterAllCount //= 2
    if possibleInterAllCount:
        interChrProb = 1.0 / possibleInterAllCount
    else:
        # Only one chromosome present: there is no inter-chr pair to model.
        interChrProb = 0
    baselineIntraChrProb = 1.0 / possibleIntraAllCount

    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        if myUtils.in_range_check(i, distLowThres, distUpThres):
            possibleIntraInRangeCount += mainDic[i][0]

    print("Number of all fragments= " + str(noOfFrags) + "\t resolution= " +
          str(resolution))
    print("Possible, Intra-chr in range: pairs= " +
          str(possibleIntraInRangeCount))
    print("Possible, Intra-chr all: pairs= " + str(possibleIntraAllCount))
    print("Possible, Inter-chr all: pairs= " + str(possibleInterAllCount))
    print(
        "Desired genomic distance range	[%d %d]" % (distLowThres, distUpThres)
        + "\n")
    print(
        "Range of possible genomic distances	[0	%d]" % (maxPossibleGenomicDist)
        + "\n")

    return (mainDic, noOfFrags)  # return from generate_FragPairs
def calculate_Probabilities(mainDic,outfilename):
    """Group genomic distances into equal-occupancy bins and report each bin.

    Distances are visited in increasing order in steps of the global
    ``resolution``.  In-range distances are collected until their contact
    counts (``mainDic[d][1]``) reach the current per-bin target; the bin is then
    closed and its pair-weighted mean distance and mean contact probability are
    recorded.  The per-bin table is printed and written to
    ``<outfilename>.res<resolution>.txt``.

    Args:
        mainDic: dict mapping a distance to a sequence whose item 0 is summed as
            the number of locus pairs and item 1 as the contact count.
        outfilename: prefix of the result file.

    Returns:
        ``[x, y, yerr]``: average distance, mean contact probability and
        standard error (always 0 here) of each closed bin.
    """
    # Trailing commas kept from the original: they suppress the extra newline
    # under the Python 2 print statement and are no-ops under Python 3.
    print("\nCalculating probability means and standard deviations by equal-occupancy binning of contact counts\n"),
    print("------------------------------------------------------------------------------------\n"),
    resultsPath=outfilename+'.res'+str(resolution)+'.txt'

    desiredPerBin=(observedIntraInRangeSum)/noOfBins
    print("observed intra-chr read counts in range\t"+repr(observedIntraInRangeSum)+
          ",\tdesired number of contacts per bin\t"+repr(desiredPerBin)+
          ",\tnumber of bins\t"+repr(noOfBins)+"\n"),

    # the following five lists will be the print outputs
    x=[] # avg genomic distances of bins
    y=[] # avg interaction probabilities of bins
    yerr=[] # stderrs of bins
    pairCounts=[] # number of pairs in bins
    interactionTotals=[] # number of interactions (reads) in bins

    interactionTotalForBinTermination=0
    n=0 # bin counter so far
    totalInteractionCountSoFar=0
    distsToGoInAbin=[]

    for i in range(0,maxPossibleGenomicDist+1,resolution):
        countAtDist=mainDic[i][1]
        totalInteractionCountSoFar+=countAtDist
        if myUtils.in_range_check(i,distLowThres,distUpThres)==False:
            continue

        distsToGoInAbin.append(i)
        # The bin closes when this distance alone, or together with what has
        # been accumulated so far, reaches the target.
        binFull=(countAtDist>=desiredPerBin or
                 interactionTotalForBinTermination+countAtDist>=desiredPerBin)
        if not binFull:
            interactionTotalForBinTermination+=countAtDist
            continue

        n+=1
        # dynamically update the desiredPerBin after each bin is full
        if n<noOfBins:
            desiredPerBin=1.0*(observedIntraInRangeSum-totalInteractionCountSoFar)/(noOfBins-n)
        se_p=0 # for now I'm not worrying about error etc.

        noOfPairsForBin=0
        interactionTotalForBin=0
        avgDistance=0
        for b in distsToGoInAbin:
            noOfPairsForBin+=mainDic[b][0]
            interactionTotalForBin+=mainDic[b][1]
            avgDistance+=1.0*mainDic[b][0]*(b/distScaling)
        meanProbabilityObsv=(1.0*interactionTotalForBin/noOfPairsForBin)/observedIntraInRangeSum
        avgDistance=distScaling*(avgDistance/noOfPairsForBin)

        # append this bin
        x.append(float(avgDistance))
        y.append(float(meanProbabilityObsv))
        yerr.append(float(se_p))
        pairCounts.append(noOfPairsForBin)
        interactionTotals.append(interactionTotalForBin)

        # Function-call form parses on Python 2 and Python 3 alike; the
        # original print statement was a SyntaxError on Python 3.
        print("%d" % n+ "\t" + "%f" % avgDistance + "\t"+"%.2e" % meanProbabilityObsv + "\t"
              + "%.2e" % se_p +"\t" +"%d" % noOfPairsForBin +"\t" +"%d" % interactionTotalForBin)

        # reset counts
        interactionTotalForBinTermination=0
        distsToGoInAbin=[]
    # NOTE: distances still pending in distsToGoInAbin when the loop ends are
    # not emitted as a bin (unchanged from the original behaviour).

    # Report the file that is actually written.
    print("Writing equal-occupancy binning results to %s" % resultsPath + "\n"),
    with open(resultsPath, 'w') as outfile:
        outfile.write("avgGenomicDist\tcontactProbability\tstandardError\tnoOfLocusPairs\ttotalOfContactCounts\n")
        for i in range(len(x)):
            outfile.write("%d" % x[i] + "\t"+"%.2e" % y[i]+ "\t" + "%.2e" % yerr[i] + "\t"
                          +"%d" % pairCounts[i] + "\t" +"%d" % interactionTotals[i]+"\n")
    return [x,y,yerr] # from calculate_Probabilities
def calculate_Probabilities(mainDic, outfilename):
    """Compute per-bin contact probabilities by equal-occupancy binning.

    Walks the distances ``0, resolution, 2*resolution, ...`` up to the global
    ``maxPossibleGenomicDist``.  Distances that pass ``myUtils.in_range_check``
    are accumulated into a bin until the bin's contact counts reach the current
    target; the target is re-estimated after every closed bin from the counts
    that remain.  Each closed bin is printed and, at the end, all bins are
    written to ``<outfilename>.res<resolution>.txt``.

    Args:
        mainDic: dict keyed by distance; item 0 of each value is summed as the
            number of locus pairs, item 1 as the contact count.
        outfilename: prefix of the tab-separated result file.

    Returns:
        A list ``[x, y, yerr]`` with, per closed bin, the pair-weighted average
        distance, the mean contact probability, and the standard error (which
        this implementation always sets to 0).
    """
    # Trailing commas are inherited: no extra newline under the Python 2 print
    # statement, harmless tuple expressions under Python 3.
    print(
        "\nCalculating probability means and standard deviations by equal-occupancy binning of contact counts\n"
    ),
    print(
        "------------------------------------------------------------------------------------\n"
    ),
    resultsPath = outfilename + '.res' + str(resolution) + '.txt'

    desiredPerBin = (observedIntraInRangeSum) / noOfBins
    print("observed intra-chr read counts in range\t" +
          repr(observedIntraInRangeSum) +
          ",\tdesired number of contacts per bin\t" + repr(desiredPerBin) +
          ",\tnumber of bins\t" + repr(noOfBins) + "\n"),

    # the following five lists will be the print outputs
    x = []  # avg genomic distances of bins
    y = []  # avg interaction probabilities of bins
    yerr = []  # stderrs of bins
    pairCounts = []  # number of pairs in bins
    interactionTotals = []  # number of interactions (reads) in bins

    interactionTotalForBinTermination = 0
    n = 0  # bin counter so far
    totalInteractionCountSoFar = 0
    distsToGoInAbin = []

    for i in range(0, maxPossibleGenomicDist + 1, resolution):
        countAtDist = mainDic[i][1]
        totalInteractionCountSoFar += countAtDist
        if myUtils.in_range_check(i, distLowThres, distUpThres) == False:
            continue

        distsToGoInAbin.append(i)
        # Close the bin if this distance alone fills it, or if adding it to the
        # running total does.
        binFull = (countAtDist >= desiredPerBin or
                   interactionTotalForBinTermination + countAtDist >= desiredPerBin)
        if not binFull:
            interactionTotalForBinTermination += countAtDist
            continue

        n += 1
        # dynamically update the desiredPerBin after each bin is full
        if n < noOfBins:
            desiredPerBin = 1.0 * (observedIntraInRangeSum -
                                   totalInteractionCountSoFar) / (noOfBins - n)
        se_p = 0  # for now I'm not worrying about error etc.

        noOfPairsForBin = 0
        interactionTotalForBin = 0
        avgDistance = 0
        for b in distsToGoInAbin:
            noOfPairsForBin += mainDic[b][0]
            interactionTotalForBin += mainDic[b][1]
            avgDistance += 1.0 * mainDic[b][0] * (b / distScaling)
        meanProbabilityObsv = (1.0 * interactionTotalForBin /
                               noOfPairsForBin) / observedIntraInRangeSum
        avgDistance = distScaling * (avgDistance / noOfPairsForBin)

        # append this bin
        x.append(float(avgDistance))
        y.append(float(meanProbabilityObsv))
        yerr.append(float(se_p))
        pairCounts.append(noOfPairsForBin)
        interactionTotals.append(interactionTotalForBin)

        # print() call instead of the Python 2 print statement, which does not
        # parse on Python 3; a single parenthesised argument behaves the same
        # on both.
        print("%d" % n + "\t" + "%f" % avgDistance + "\t" +
              "%.2e" % meanProbabilityObsv + "\t" + "%.2e" % se_p + "\t" +
              "%d" % noOfPairsForBin + "\t" + "%d" % interactionTotalForBin)

        # reset counts
        interactionTotalForBinTermination = 0
        distsToGoInAbin = []
    # NOTE: a partially filled bin left over at the end of the loop is not
    # reported (same as before).

    # Name the file that is really written in the progress message.
    print("Writing equal-occupancy binning results to %s" % resultsPath + "\n"),
    with open(resultsPath, 'w') as outfile:
        outfile.write(
            "avgGenomicDist\tcontactProbability\tstandardError\tnoOfLocusPairs\ttotalOfContactCounts\n"
        )
        for i in range(len(x)):
            outfile.write("%d" % x[i] + "\t" + "%.2e" % y[i] + "\t" +
                          "%.2e" % yerr[i] + "\t" + "%d" % pairCounts[i] +
                          "\t" + "%d" % interactionTotals[i] + "\n")
    return [x, y, yerr]  # from calculate_Probabilities
def _lookup_bias(biasDic, chrom, mid):
    """Return the bias stored for locus (chrom, mid); print a warning and return -1 if it is missing."""
    if chrom not in biasDic:
        # Adjacent literals instead of backslash continuations, so the message
        # no longer contains the source indentation.
        print("Warning. Bias file does not contain chromosome %s. "
              "Please ensure you're using correct file. Fit-Hi-C will continue with "
              "bias = -1 for this locus" % chrom)
        return -1
    if mid not in biasDic[chrom]:
        print("Error. Bias file does not contain midpoint %s within "
              "%s. Please ensure you're using the correct file and/or resolution "
              "argument. Fit-Hi-C will continue with bias = -1 for this locus"
              % (mid, chrom))
        return -1
    return biasDic[chrom][mid]


def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic,outliersline,outliersdist,observedIntraInRangeSum, possibleIntraInRangeCount, possibleInterAllCount, observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit the distance/probability spline and write p- and q-values.

    Unless the global ``interOnly`` is set, a ``UnivariateSpline`` is fitted to
    the binned means ``(x, y)`` and made non-increasing with isotonic
    regression.  Every interaction in ``infilename`` then gets a binomial
    p-value (``scsp.bdtrc``), the p-values are corrected with
    ``myStats.benjamini_hochberg_correction``, and the results are written to a
    gzipped ``.significances.txt.gz`` file.  Interactions whose p-value is below
    the outlier threshold are added to ``outliersline`` / ``outliersdist``.

    Args:
        mainDic: dict whose keys are the candidate genomic distances.
        x, y, yerr: per-bin average distance, mean probability, standard error.
            ``x`` is sorted in place when ``outliersdist`` is not None.
        infilename: gzipped interactions file with five whitespace-separated
            columns (chr1, mid1, chr2, mid2, count).
        outfilename: prefix for the plot and significance files.
        biasDic: ``{chromosome: {midpoint: bias}}``; falsy to use bias 1.0.
        outliersline, outliersdist: sets updated in place with the line index
            and distance of each outlier.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to
            the binomial tail.
        possibleIntraInRangeCount, possibleInterAllCount: numbers of possible
            pairs, used for the outlier threshold and the FDR correction.
        observedIntraAllSum: accepted for interface compatibility; not used.
        resolution: when truthy, it is included in the output file name.
        passNo: label of the spline pass shown in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``; the spline entries are None when ``interOnly`` is set and the
        FDR entries are None unless ``visual`` is True.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x,y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1,len(x)):
            if x[i]<=x[i-1]:
                # The check requires strictly increasing distances, so say so.
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i-1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError=min(y)*min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX=max(x)
        tempMinX=min(x)

        # Keep only distances inside [min(x), max(x)], i.e. where the spline
        # is defined.
        splineX=[dis for dis in sorted(mainDic) if tempMinX<=dis<=tempMaxX]
        splineY=ius(splineX)

        # enforce a non-increasing probability with growing distance
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX,splineY)
        residual =sum([r*r for r in (y - ius(x))])

        if visual==True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2,1,1)
            plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
            plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2,1,2)
            plt.loglog(splineX,newSplineY,'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')
            plt.savefig(outfilename+'.png')

    # NOW write the calculated pvalues and corrected pvalues in a file
    intraInRangeCount=0
    intraOutOfRangeCount=0
    intraVeryProximalCount=0
    interCount=0
    discardCount=0
    p_vals=[]
    q_vals=[]
    biasl=[]
    biasr=[]
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1,mid1,ch2,mid2,contactCount=line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn=myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres,distUpThres)

            # assumes there is no bias to begin with;
            # if the biasDic is not null sets the real bias values
            bias1=1.0
            bias2=1.0
            if biasDic:
                bias1=_lookup_bias(biasDic, ch1, mid1)
                bias2=_lookup_bias(biasDic, ch2, mid2)
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1<0 or bias2<0) and interactionType !='inter':
                prior_p=1.0
                p_val=1.0
                discardCount+=1
            elif interactionType=='intraInRange' and not interOnly:
                # clamp the distance to the range covered by the bins
                distToLookUp=max(interxn.getDistance(),tempMinX)
                distToLookUp=min(distToLookUp,tempMaxX)
                i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
                prior_p=newSplineY[i]*(bias1*bias2)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedIntraInRangeSum,prior_p)
                intraInRangeCount +=1
            elif interactionType =='intraShort' and not interOnly:
                prior_p=1.0
                p_val=1.0
                intraVeryProximalCount += 1
            elif interactionType =='intraLong' and not interOnly:
                prior_p=1.0
                p_val=1.0
                intraOutOfRangeCount += 1
            else:
                if allReg or interOnly:
                    prior_p=interChrProb*(bias1*bias2)
                    p_val=scsp.bdtrc(interxn.getCount()-1,observedInterAllSum,prior_p)
                    interCount += 1
                else:
                    p_val=1.0
            p_vals.append(p_val)

    # Do the BH FDR correction
    outlierThres = 0
    if allReg:
        outlierThres=1.0/(possibleIntraInRangeCount+possibleInterAllCount)
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraInRangeCount)
    elif interOnly and not allReg:
        outlierThres = 1.0/possibleInterAllCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0/possibleIntraInRangeCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # now we write the values back to the file
    if resolution:
        significancesPath = outfilename+'.res'+str(resolution)+'.significances.txt.gz'
    else:
        significancesPath = outfilename+'.significances.txt.gz'
    # Announce the path that is actually written.
    print("Writing p-values and q-values to file %s" % significancesPath)
    rowFormat = "%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, gzip.open(significancesPath, 'wt') as outfile:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
        for count, line in enumerate(infile):
            words=line.rstrip().split()
            chr1=words[0]
            midPoint1=int(words[1])
            chr2=words[2]
            midPoint2=int(words[3])
            interactionCount=float(words[4])
            p_val=p_vals[count]
            q_val=q_vals[count]
            bias1=biasl[count]
            bias2=biasr[count]
            row = rowFormat % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2)

            if (allReg or interOnly) and chr1!=chr2:
                outfile.write(row)
            if (allReg or not interOnly) and chr1==chr2:
                interactionDistance = abs(midPoint1-midPoint2)
                if myUtils.in_range_check(interactionDistance,distLowThres, distUpThres):
                    outfile.write(row)

            if p_val<outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1-midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
        minFDR=0.0
        maxFDR=0.05
        increment=0.001
        FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,outfilename+".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy] # from fit_Spline
def _advance_bin(binStats, binTracker, intxnDistance):
    """Advance binTracker to the bin whose [min, max] range holds intxnDistance.

    The search only moves forward and stops at the last key present in
    ``binStats`` when no bin covers the distance.  Returns the pair
    ``(binTracker, currBin)``.
    """
    currBin = binStats[binTracker]
    while not (currBin[0][0] <= intxnDistance <= currBin[0][1]):
        if (binTracker + 1) not in binStats:
            break
        binTracker += 1
        currBin = binStats[binTracker]
    return binTracker, currBin


def generate_FragPairs(binStats, fragsfile, resolution):
    """Count possible fragment pairs and accumulate them into the distance bins.

    Reads the gzipped fragments file, keeps the midpoints (column 2) of
    fragments whose hit count (column 3) is at least the global ``mappThres``,
    and enumerates the intra-chromosomal pairs that pass
    ``myUtils.in_range_check``.  For every such distance the matching entry of
    ``binStats`` is updated in place (items 1, 3 and 7).  With a truthy
    ``resolution`` distances are stepped in multiples of it; otherwise every
    pair of fragments on a chromosome is visited.  Progress and totals are
    appended to the global ``logfile``.

    Args:
        binStats: dict keyed by consecutive integers starting at 0; item 0 of
            each value is the ``(min, max)`` distance range of the bin.
        fragsfile: path of the gzipped fragments file.
        resolution: fixed bin size, or a falsy value for the all-pairs mode.

    Returns:
        ``(binStats, noOfFrags, maxPossibleGenomicDist,
        possibleIntraInRangeCount, possibleInterAllCount, interChrProb,
        baselineIntraChrProb)``.
    """
    if resolution:
        headerLine = "Looping through all possible fragment pairs in-range\n"
    else:
        headerLine = "Enumerating all possible fragment pairs in-range\n"
    with open(logfile, 'a') as log:
        log.write(headerLine)
        log.write("------------------------------------------------------------------------------------\n")

    startT = time.time()

    minPossibleGenomicDist = float("inf")
    maxPossibleGenomicDist = 0
    possibleIntraAllCount = 0
    possibleInterAllCount = 0
    possibleIntraInRangeCount = 0
    interChrProb = 0
    baselineIntraChrProb = 0

    # chromosome -> midpoints of its mappable fragments (possibly empty)
    allFragsDic={}
    with gzip.open(fragsfile,'rt') as infile:
        for line in infile:
            words=line.split()
            currChr=words[0]
            currMid=int(words[2])
            currHit=int(words[3])
            if currChr not in allFragsDic:
                allFragsDic[currChr]=[]
            if currHit>=mappThres:
                allFragsDic[currChr].append(currMid)

    if resolution:
        noOfFrags=0
        maxFrags={}
        for ch in allFragsDic:
            frags=allFragsDic[ch]
            noOfFrags+=len(frags)
            if not frags:
                # A chromosome can be listed without any mappable fragment;
                # max() of an empty sequence would raise ValueError.
                maxFrags[ch]=None
                continue
            maxFrags[ch]=max(int(i)-resolution/2 for i in frags)
            maxPossibleGenomicDist=max(maxPossibleGenomicDist,maxFrags[ch])

        for ch in sorted(allFragsDic.keys()):
            maxFrag=maxFrags[ch]
            n=len(allFragsDic[ch])
            binTracker = 0
            possibleIntraInRangeCountPerChr = 0
            # no fragments -> no distances to enumerate for this chromosome
            distances = range(0,int(maxFrag+1),resolution) if n else ()
            for d, intxnDistance in enumerate(distances):
                # at the d-th distance step n-d pairs are possible
                npairs = n-d
                if not myUtils.in_range_check(intxnDistance,distLowThres,distUpThres):
                    continue
                minPossibleGenomicDist = min(minPossibleGenomicDist, intxnDistance)
                possibleIntraInRangeCountPerChr += npairs

                binTracker, currBin = _advance_bin(binStats, binTracker, intxnDistance)
                currBin[7]+=npairs
                currBin[1]+=npairs
                currBin[3]+=(float(intxnDistance/distScaling)*npairs)

            possibleInterAllCount+=n*(noOfFrags-n)
            possibleIntraAllCount+=(n*(n+1))/2 # n(n-1) if excluding self
            with open(logfile, 'a') as log:
                log.write("Chromosome " +repr(ch) +",\t"+str(n) +" mappable fragments, \t"+str(possibleIntraInRangeCountPerChr)\
                +" possible intra-chr fragment pairs in range,\t" + str((noOfFrags-n)*n) +" possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr
    else:
        noOfFrags = 0
        for ch in allFragsDic:
            noOfFrags += len(allFragsDic[ch])

        for ch in sorted(allFragsDic.keys()):
            fragsPerChr = sorted(allFragsDic[ch])
            templen = len(fragsPerChr)
            possibleInterAllCount += (noOfFrags-templen)*templen
            possibleIntraInRangeCountPerChr = 0
            for x in range(templen):
                binTracker = 0
                d = 0
                for y in range(x+1,templen):
                    intxnDistance = abs(float(fragsPerChr[x])-float(fragsPerChr[y]))
                    if not myUtils.in_range_check(intxnDistance, distLowThres,distUpThres):
                        continue
                    possibleIntraInRangeCountPerChr += 1
                    maxPossibleGenomicDist = max(maxPossibleGenomicDist, intxnDistance)
                    minPossibleGenomicDist = min(minPossibleGenomicDist, intxnDistance)
                    # d counts only the in-range partners seen so far for fragment x
                    npairs = templen-d
                    d+=1

                    binTracker, currBin = _advance_bin(binStats, binTracker, intxnDistance)
                    currBin[7]+=npairs
                    currBin[1]+=1
                    currBin[3]+=float(intxnDistance/distScaling)*npairs
                    possibleIntraAllCount += 1
            with open(logfile, 'a') as log:
                log.write("Chromosome " +repr(ch) +",\t"+str(templen) +" mappable fragments, \t"+str(possibleIntraInRangeCountPerChr)\
                +" possible intra-chr fragment pairs in range,\t" + str((noOfFrags-templen)*templen) +" possible inter-chr fragment pairs\n")
            possibleIntraInRangeCount += possibleIntraInRangeCountPerChr

    # every inter-chromosomal pair was counted from both chromosomes
    possibleInterAllCount/=2
    try:
        interChrProb=1.0/possibleInterAllCount
    except ZeroDivisionError:
        # single chromosome: there are no inter-chromosomal pairs
        interChrProb = 0
    baselineIntraChrProb=1.0/possibleIntraAllCount

    endT = time.time()
    print("Fragments file read. Time took %s" % (endT-startT))

    with open(logfile, 'a') as log:
        log.write("Number of all fragments= %s\n" % (noOfFrags))
        log.write("Possible, Intra-chr in range: pairs= %s \n" % (possibleIntraInRangeCount))
        log.write("Possible, Intra-chr all: pairs= %s \n" % (possibleIntraAllCount))
        log.write("Possible, Inter-chr all: pairs= %s \n" % (possibleInterAllCount))
        log.write("Desired genomic distance range [%d %s] \n" % (distLowThres,distUpThres))
        log.write("Range of possible genomic distances [%d %d] \n" % (minPossibleGenomicDist, maxPossibleGenomicDist))
        log.write("Baseline intrachromosomal probability is %s \n" % (baselineIntraChrProb))
        log.write("Interchromosomal probability is %s \n" % (interChrProb))

    return (binStats,noOfFrags, maxPossibleGenomicDist, possibleIntraInRangeCount, possibleInterAllCount, interChrProb, baselineIntraChrProb) # return from generate_FragPairs
def generate_FragPairs(infilename):
    """Enumerate in-range intra-chr fragment pairs and count inter-chr pairs.

    Reads the gzipped fragments file (column 0 chromosome, column 2 midpoint,
    column 3 hit count), keeps per chromosome the midpoints whose hit count is
    at least the global ``mappabilityThreshold``, and then visits every pair of
    kept fragments on the same chromosome.  Pairs that pass
    ``myUtils.in_range_check`` are registered in the global
    ``possiblePairsPerDistance`` under the key ``"<chr>-<lowMid>-<highMid>"``
    with the value ``[distance, 0, 1.0]``.  All results are stored in the
    module-level variables declared ``global`` below; nothing is returned.

    Rows of one chromosome are expected to be contiguous in the file: a new
    chromosome entry is started whenever the name in column 0 changes.
    """
    sys.stderr.write("\nGenerating all possible intra-chromosomal fragment pairs and counting the number of all possible inter-chr fragment pairs\n")
    sys.stderr.write("------------------------------------------------------------------------------------\n")
    global listOfMappableFrags # two dimensional list with all mappable fragment midpoints for each chr
    global chrList # list of all chromosomes (chrno (type=int))
    global possiblePairsPerDistance # all possible intra-chr fragment pairs
    global possibleInterAllCount # count of all possible inter-chr fragment pairs
    global possibleIntraAllCount # count of all possible intra-chr fragment pairs
    global possibleIntraInRangeCount # count of all possible intra-chr fragment pairs in the range we're interested
    global baselineInterChrProb # 1 divided by all possible inter-chr fragment pairs
    global baselineIntraChrProb # 1 divided by all possible intra-chr fragment pairs

    listOfMappableFrags=[]
    chrList=[]

    # read the fragments file in a single pass
    fragsPerChr=[] # temporary list that will be added to listOfMappableFrags for each chr
    totalNoOfFrags=0 # total number of all mappable fragments
    currChrNo=None # name of the chromosome being collected; None before the first row
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words=line.rstrip().split()
            chrNo=words[0] # can be an integer or a string
            #words[1] ignored
            midPoint=int(words[2])
            hitCount=int(words[3])
            if currChrNo is None:
                currChrNo=chrNo
            elif currChrNo!=chrNo:
                # whenever the name of the chromosome changes
                listOfMappableFrags.append(fragsPerChr)
                totalNoOfFrags += len(fragsPerChr)
                chrList.append(currChrNo)
                currChrNo = chrNo
                fragsPerChr=[]
            # add the mappable midPoints to the temp fragsPerChr
            if hitCount >= mappabilityThreshold:
                fragsPerChr.append(midPoint)

    # handle the last chromosome
    listOfMappableFrags.append(fragsPerChr)
    totalNoOfFrags += len(fragsPerChr)
    chrList.append(currChrNo)

    # create all possible frag pairs
    possibleInterAllCount=0
    possibleIntraInRangeCount=0
    possibleIntraAllCount=0
    # Walk names and fragment lists in lockstep instead of looking each name up
    # with chrList.index(), which is quadratic and returns the first match if a
    # name occurs more than once.
    for i, fragsPerChr in zip(chrList, listOfMappableFrags):
        countIntraPairs=0
        tempLen=len(fragsPerChr)
        possibleInterAllCount+= (totalNoOfFrags-tempLen)*tempLen
        # iterate over all possible intra-chr pairs to see which ones qualify as a 'possible' pair
        for x in range(tempLen):
            fragX=fragsPerChr[x]
            for y in range(x+1,tempLen):
                fragY=fragsPerChr[y]
                interactionDistance=abs(fragX-fragY)
                if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
                    countIntraPairs +=1
                    dictkey=str(i)+'-'+str(min(fragX,fragY))+'-'+str(max(fragX,fragY))
                    possiblePairsPerDistance[dictkey]=[interactionDistance,0,1.0] # set count to zero for now and bias to 1.0
                possibleIntraAllCount+=1
        possibleIntraInRangeCount+=countIntraPairs
        sys.stderr.write("Chromosome " +repr(i) +",\t"+str(tempLen) +" mappable fragments, \t"+str(countIntraPairs)\
        +" possible intra-chr fragment pairs in range,\t" + str((totalNoOfFrags-tempLen)*tempLen) +" possible inter-chr fragment pairs\n")

    # divide the possibleInterAllCount by 2 so that every inter-chr interaction is counted only once;
    # the sum is always even, and floor division keeps it an integer on Python 3
    possibleInterAllCount=possibleInterAllCount//2
    sys.stderr.write("Total of \t"+str(possibleIntraInRangeCount) +" possible intra-chr fragment pairs in range,\t"\
    +str(possibleIntraAllCount) +" possible intra-chr fragment pairs,\t"\
    +str(possibleInterAllCount) +" possible inter-chr fragment pairs\n")

    # calculate inter-chr probabilities
    if possibleInterAllCount >0:
        baselineInterChrProb=1.0/possibleInterAllCount
    baselineIntraChrProb=1.0/possibleIntraAllCount

    return # from generate_FragPairs
def fit_Spline(x,y,yerr,infilename,sortedInteractions,biasDic,figname,passNo):
    """Fit the probability spline, compute p-/q-values and flag outliers.

    A ``UnivariateSpline`` is fitted to the binned means ``(x, y)`` and made
    non-increasing through the R function ``monoreg`` (antitonic regression,
    called via ``ro``).  Each line of ``infilename`` receives a binomial
    p-value (``scsp.bdtrc``); the p-values are corrected with
    ``myStats.benjamini_hochberg_correction`` and written to
    ``<outdir>/<figname>.significances.txt.gz``.  Finally every entry of
    ``sortedInteractions`` is tested against the outlier threshold
    ``1 / possibleIntraInRangeCount``.

    Args:
        x, y, yerr: per-bin average distance, mean probability, standard error.
        infilename: gzipped interactions file (chr1, mid1, chr2, mid2, count).
        sortedInteractions: iterable of ``(distance, count, bias)`` triples.
        biasDic: ``{chromosome: {midpoint: bias}}``; empty to use bias 1.0.
        figname: base name of the figures and of the significance file.
        passNo: label of the spline pass shown in the plot legends.

    Returns:
        ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``.  ``FDRx``
        and ``FDRy`` are None unless the global ``visual`` is True.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write("------------------------------------------------------------------------------------\n")
    sys.stderr.write("baseline intra-chr probability: " + repr(baselineIntraChrProb)+ "\tbaseline inter-chr probability: " + repr(baselineInterChrProb)+"\n")

    # Bound up front so the return statement also works when no plots are made.
    FDRx=None
    FDRy=None

    # assume residualFactor==-1:
    splineError=min(y)*min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### This is done by calling the R function monoreg with type "antitonic",
    ### so probabilities do not increase with increasing genomic distance
    tempMaxX=max(x)
    tempMinX=min(x)
    tempList=sorted(list(set([int(i[0]) for i in sortedInteractions])))
    ### Keep only distances within [min(x) max(x)], where the spline is defined
    splineX=[i for i in tempList if tempMinX<=i and i<=tempMaxX]
    splineY=ius(splineX)

    # R vector format
    rSplineX=ro.FloatVector(splineX)
    rSplineY=ro.FloatVector(splineY)
    rMonoReg=ro.r['monoreg']
    # do the antitonic regression
    allRres=rMonoReg(rSplineX,rSplineY,type="antitonic")
    rNewSplineY=allRres[3]
    # convert data back to Python format
    newSplineY=[rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual =sum([r*r for r in (y - ius(x))])

    if visual==True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2,1,1)
        plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)

        if useInters:
            plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineIntraChrProb for i in x],toProb),'k-',label="Baseline intra-chromosomal")
            # the curve labelled inter-chromosomal must use the inter-chr baseline
            plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineInterChrProb for i in x],toProb),'b-',label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)',fontsize='large')
        plt.xlabel('Genomic distance (kb)',fontsize='large')
        if distLowThres>-1 and distUpThres>-1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
        plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2,1,2)
        plt.loglog(splineX,newSplineY,'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
        if useInters:
            plt.loglog(x,[baselineIntraChrProb for i in x],'k-')
            plt.loglog(x,[baselineInterChrProb for i in x],'b-')
        if distLowThres>-1 and distUpThres>-1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)',fontsize='large')
        plt.xlabel('Genomic distance (log-scale)',fontsize='large')
        plt.savefig(outdir+'/'+figname+'.png')

    # NOW write the calculated pvalues and corrected pvalues in a file
    intraInRangeCount=0
    intraOutOfRangeCount=0
    intraVeryProximalCount=0
    interCount=0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " + repr(distUpThres) +"\n")
    p_vals=[]
    q_vals=[]
    with gzip.open(infilename, 'r') as infile:
        for line in infile:
            words=line.rstrip().split()
            interxn=myUtils.Interaction([words[0], int(words[1]), words[2], int(words[3])])
            interxn.setCount(int(words[4]))
            chr1=words[0]
            chr2=words[2]
            midPoint1=int(words[1])
            midPoint2=int(words[3])

            # assumes there is no bias to begin with;
            # if the biasDic is not null sets the real bias values
            bias1=1.0
            bias2=1.0
            if len(biasDic)>0:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1=biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2=biasDic[chr2][midPoint2]

            if (bias1<0 or bias2<0) and interxn.type!='inter':
                prior_p=1.0
                p_val=1.0
            else:
                # classify once instead of re-evaluating getType per branch
                interactionType=interxn.getType(distLowThres,distUpThres)
                if interactionType=='intraInRange':
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp=max(interxn.distance,tempMinX)
                    distToLookUp=min(distToLookUp,tempMaxX)
                    i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
                    prior_p=newSplineY[i]*(bias1*bias2) # biases added in the picture
                    intraInRangeCount +=1
                    ############# THIS HAS TO BE interactionCount-1 ##################
                    p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraInRangeSum,prior_p)
                elif interactionType=='intraShort':
                    prior_p=1.0
                    p_val=1.0
                    intraVeryProximalCount +=1
                elif interactionType=='intraLong':
                    # out of range bigger than distUpThres
                    prior_p=1.0
                    p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraAllSum,prior_p)
                    intraOutOfRangeCount +=1
                elif useInters:
                    prior_p=baselineInterChrProb*(bias1*bias2) # biases added in the picture
                    ############# THIS HAS TO BE interactionCount-1 ##################
                    p_val=scsp.bdtrc(interxn.hitCount-1,observedInterAllSum,prior_p)
                    interCount +=1
                else:
                    # Inter-chromosomal line while inter-chr analysis is off:
                    # record a neutral p-value so p_vals stays aligned with the
                    # lines of the file in the second pass.
                    p_val=1.0
            # exactly one p-value per input line
            p_vals.append(p_val)

    # Do the BH FDR correction
    if useInters:
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " + repr(possibleInterAllCount+possibleIntraAllCount)+"\n")
    else:
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount)+"\n")

    sys.stderr.write("Writing p-values to file %s" % figname + ".significances.txt.gz\n")
    rowFormat="%s\t%d\t%s\t%d\t%d\t%e\t%e\n"
    with gzip.open(infilename, 'r') as infile, gzip.open(outdir+'/'+figname+'.significances.txt.gz', 'w') as outfile:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for count, line in enumerate(infile):
            words=line.rstrip().split()
            chrNo1=words[0]
            midPoint1=int(words[1])
            chrNo2=words[2]
            midPoint2=int(words[3])
            interactionCount=int(words[4])
            p_val=p_vals[count]
            q_val=q_vals[count]

            if useInters==False and chrNo1==chrNo2: # intra
                interactionDistance=abs(midPoint1-midPoint2) # dist
                if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
                    outfile.write(rowFormat % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
            elif useInters==True and chrNo1!=chrNo2:
                outfile.write(rowFormat % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
    # END - printing pvals and qvals for all the interactions

    isOutlier=[]
    distsBelow=[]
    distsAbove=[]
    intcountsBelow=[]
    intcountsAbove=[]
    outlierThres=1.0/possibleIntraInRangeCount
    for interactionDistance,interactionCount,bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp=max(interactionDistance,tempMinX)
        distToLookUp=min(distToLookUp,tempMaxX)
        i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
        prior_p=newSplineY[i]*float(bias12) # biases added in the picture
        ############# THIS HAS TO BE interactionCount-1 ##################
        p_val=scsp.bdtrc(interactionCount-1,observedIntraInRangeSum,prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            distsAbove.append(interactionDistance)
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)
    # END for - doing the outlier check for all interactions in sortedInteractions

    if visual==True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" % figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Floor division: sample() needs an integer size, and '/' yields a
        # float on Python 3.
        downsample=30 # for the non-outliers
        # The non-outlier sample is not plotted; it is still drawn so the
        # random state seen by the outlier sample below is unchanged.
        randIndcsAbove=sample([i for i in range(len(intcountsAbove))],len(intcountsAbove)//downsample)
        randIndcsAbove=sorted(randIndcsAbove)
        downsample=20 # for the outliers
        randIndcsBelow=sample([i for i in range(len(intcountsBelow))],len(intcountsBelow)//downsample)
        randIndcsBelow=sorted(randIndcsBelow)

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],toKb),[intcountsBelow[i] for i in randIndcsBelow], 'r.',label="Outliers (p-value < 1/M)")
        # extend the spline flat up to the largest observed distance
        plt.plot(myUtils.scale_a_list(splineX+[maxObservedGenomicDist],toKb),[p*observedIntraInRangeSum for p in newSplineY]+[newSplineY[-1]*observedIntraInRangeSum], 'g-', label="spline-"+str(passNo)+" (x N)", linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        print(repr(len(intcountsBelow))+"\t"),
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow)>0:
            plt.ylim([0,min(max(intcountsBelow),1500)])
        if distLowThres>-1 and distUpThres>-1:
            plt.xlim([0, distUpThres*toKb])
        ax.legend(loc="upper right",fancybox=True)
        plt.savefig(outdir+'/'+figname+'.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount)+"\tintraOutOfRangeCount " +\
        repr(intraOutOfRangeCount)+"\tintraVeryProximalCount " + repr(intraVeryProximalCount) +"\tinterCount " + repr(interCount)+"\n")

    if visual==True:
        sys.stderr.write("Plotting q-values to file %s" % figname + ".qplot.png\n")
        minFDR=0.0
        maxFDR=0.05
        increment=0.001
        FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,figname+".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy] # from fit_Spline