def call_bdtrc(hitCount, observedSum, prior_p, recursion=0):
    """Evaluate ``scsp.bdtrc`` and retry with halved counts while it is NaN.

    ``hitCount`` and ``observedSum`` are truncated to ``int`` before every
    evaluation. Whenever the result is NaN, both counts are divided by two
    (then truncated) and the evaluation is repeated one level deeper.
    ``recursion`` is the level to start from; once the level reaches 10 the
    search gives up and 1.0 is returned.
    """
    hits, total, depth = hitCount, observedSum, recursion
    while depth < 10:
        tail_prob = scsp.bdtrc(int(hits), int(total), prior_p)
        if not np.isnan(tail_prob):
            return tail_prob
        # NaN result: shrink both counts by the same factor and try again.
        hits, total = int(hits / 2), int(total / 2)
        depth += 1
    return 1.0
def _sf_single(self, x, n, a, b):
    """Binomial survival function averaged over a Beta(a, b) prior on p.

    The unit interval is cut into 10000 equal sub-intervals. For each one the
    binomial tail ``bdtrc(floor(x), n, p)`` is evaluated at the midpoint and
    weighted by the Beta(a, b) probability mass of that sub-interval (the
    difference of the Beta CDF at its two edges). The weighted values are
    summed over the last axis and returned.

    ``self`` is not used by the computation.
    """
    # ``btdtr`` (the Beta CDF) is deprecated in SciPy in favour of
    # ``betainc``, which computes the same regularized incomplete beta
    # function. Imported locally so this block does not depend on the
    # file-level import list.
    from scipy.special import betainc

    k = floor(x)
    p = linspace(0, 1, num=10001)
    beta_cdf = betainc(a, b, p)
    p_med = (p[:-1] + p[1:]) / 2          # midpoints of the sub-intervals
    beta_mass = beta_cdf[1:] - beta_cdf[:-1]  # prior mass of each sub-interval
    return (bdtrc(k, n, p_med) * beta_mass).sum(axis=-1)
def call_bdtrc(hitCount, observedSum, prior_p, recursion=0):
    """Return ``scsp.bdtrc`` for the given counts, falling back on NaN.

    The counts are cast to ``int`` for the evaluation. A NaN result triggers
    a recursive retry with both counts halved (and truncated) at
    ``recursion + 1``. When ``recursion`` is 10 or more, 1.0 is returned
    without evaluating anything.
    """
    if recursion >= 10:
        return 1.0

    estimate = scsp.bdtrc(int(hitCount), int(observedSum), prior_p)
    return (
        call_bdtrc(int(hitCount / 2), int(observedSum / 2), prior_p,
                   recursion + 1)
        if np.isnan(estimate)
        else estimate
    )
def grbbinomial_Pmin_raw(localProb, Ndraws):
    """Return the smallest cumulative binomial probability and its tail size.

    For the i-th entry of ``localProb`` (i = 0 .. Ntail-1) the probability of
    more than ``i`` successes in ``Ndraws`` trials with that success
    probability is evaluated. The minimum of those values is returned
    together with the 1-based position at which it occurs.
    """
    probs = np.asarray(localProb)
    exceed_counts = np.arange(len(probs))
    # Cumulative binomial probability of getting (1+, 2+, ... Ntail+) events
    # this improbable.
    # NB: stats.binom.sf maps to the lower level special.bdtrc
    cumulative = special.bdtrc(exceed_counts, Ndraws, probs)
    best = cumulative.argmin()
    return cumulative[best], best + 1
def grbbinomialtest_threshold(Ndraws, Ntail, percentile, Nmc,
                              discreteness=None, blocksize=10000):
    """
    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest_threshold.m

    Monte-Carlo estimate of the threshold on the minimum cumulative binomial
    probability (Pmin).

    Ndraws is a scalar saying how many GRBs were analyzed in total
    Ntail is the number of loudest GRB events kept (must not exceed Ndraws)
    percentile is the desired percentile of the binomial probability
        distribution (This should be between 0 and 100!)
    Nmc is the number of Monte-Carlo simulations to perform in assessing
        significance
    discreteness is optional, but allows you to draw FAP values uniformly
        from multiples of 1 / discreteness
    blocksize is the maximum number of simulations drawn at once

    Return the threshold on Pmin for the given percentile (a single value).
    """
    assert Ntail <= Ndraws

    def draw(count):
        # One row of Ndraws simulated FAP values per Monte-Carlo trial.
        if discreteness is None:
            return stats.uniform.rvs(size=(count, Ndraws))
        return stats.randint.rvs(
            0, discreteness + 1, size=(count, Ndraws)) / discreteness

    tail_orders = np.arange(Ntail)[None, :]
    PminMC = []
    remaining = Nmc
    while remaining > 0:
        # draw random numbers in blocks to reduce memory
        batch = min(remaining, blocksize)
        trials = draw(batch)

        # keep Ntail most significant values of this block
        trials.sort(axis=1)
        kept = trials[:, :Ntail]

        # NB: stats.binom.sf maps to the lower level special.bdtrc
        PMC = special.bdtrc(tail_orders, Ndraws, kept)
        PminMC.extend(PMC.min(axis=1))
        remaining -= batch

    # determine threshold on Pmin
    return stats.scoreatpercentile(np.asarray(PminMC), percentile)
def grbbinomialtest_threshold(Ndraws, Ntail, percentile, Nmc,
                              discreteness=None, blocksize=10000):
    """
    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest_threshold.m

    Simulate Nmc sets of Ndraws FAP values, compute for each set the minimum
    cumulative binomial probability over its Ntail smallest values, and
    return the requested percentile of those minima.

    Ndraws is a scalar saying how many GRBs were analyzed in total
    Ntail is the number of loudest GRB events kept
    percentile is the desired percentile of the binomial probability
        distribution (This should be between 0 and 100!)
    Nmc is the number of Monte-Carlo simulations to perform in assessing
        significance
    discreteness is optional, but allows you to draw FAP values uniformly
        from multiples of 1 / discreteness
    blocksize caps how many simulations are generated per iteration

    Return the threshold on Pmin for the given percentile.
    """
    assert Ntail <= Ndraws

    continuous = discreteness is None
    PminMC = []
    num_drawn = 0
    while num_drawn < Nmc:
        # draw random numbers in blocks to reduce memory
        num_to_draw = min(Nmc - num_drawn, blocksize)
        shape = (num_to_draw, Ndraws)
        if continuous:
            simulated = stats.uniform.rvs(size=shape)
        else:
            simulated = stats.randint.rvs(
                0, discreteness + 1, size=shape) / discreteness

        # keep Ntail most significant values of this block
        most_significant = np.sort(simulated, axis=1)[:, :Ntail]

        # NB: stats.binom.sf maps to the lower level special.bdtrc
        PMC = special.bdtrc(np.arange(Ntail)[None, :], Ndraws,
                            most_significant)
        PminMC.extend(PMC.min(axis=1))
        num_drawn += num_to_draw

    # determine threshold on Pmin
    minima = np.asarray(PminMC)
    return stats.scoreatpercentile(minima, percentile)
def grbbinomialtest(localProb, Ndraws, Nmc, discreteness=None):
    """
    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest.m

    localProb is a *sorted* array of FAP values, one per GRB to be tested
    Ndraws is a scalar saying how many GRBs were analyzed in total
    Nmc is the number of Monte-Carlo simulations to perform in assessing
        significance.
    discreteness is optional, but allows you to draw FAP values uniformly
        from multiples of 1 / discreteness

    Returns the tuple (Pmin_raw, Pmin, Nmin):

    Pmin_raw  Lowest cumulative binomial probability of the input set
              localProb. Note that this number does not account for the
              trials factor when length(localProb)>1.
    Pmin      Fraction of the Nmc simulated sets of Ndraws uniformly
              distributed random numbers whose tail of length(localProb)
              gives a cumulative binomial probability less than or equal to
              Pmin_raw.
    Nmin      Number of tail values to include at which the binomial
              probability Pmin_raw occurs.
    """
    Ntail = len(localProb)
    Pmin_raw, Nmin = grbbinomial_Pmin_raw(localProb, Ndraws)

    # Do a Monte-Carlo to determine significance
    shape = (Nmc, Ndraws)
    if discreteness is not None:
        simulated = stats.randint.rvs(
            0, discreteness + 1, size=shape) / discreteness
    else:
        simulated = stats.uniform.rvs(size=shape)

    # keep the Ntail most significant values
    most_significant = np.sort(simulated, axis=1)[:, :Ntail]

    tail_orders = np.arange(Ntail)[None, :]
    PminMC = special.bdtrc(tail_orders, Ndraws, most_significant).min(axis=1)
    Pmin = (PminMC <= Pmin_raw).mean()
    return Pmin_raw, Pmin, Nmin
def grbbinomialtest(localProb, Ndraws, Nmc, discreteness=None):
    """
    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest.m

    Binomial test on the most significant FAP values, with a Monte-Carlo
    estimate of the trials-corrected significance.

    Inputs:
      localProb     *sorted* array of FAP values, one per GRB to be tested
      Ndraws        scalar saying how many GRBs were analyzed in total
      Nmc           number of Monte-Carlo simulations to perform in
                    assessing significance
      discreteness  optional; draw FAP values uniformly from multiples of
                    1 / discreteness instead of from the continuous uniform

    Outputs (returned as a tuple, in this order):
      Pmin_raw  Lowest cumulative binomial probability of the input set
                localProb. Does not account for the trials factor when
                length(localProb)>1.
      Pmin      Fraction of simulated sets whose minimum cumulative binomial
                probability is less than or equal to Pmin_raw.
      Nmin      Number of tail values to include at which Pmin_raw occurs.
    """
    Ntail = len(localProb)
    Pmin_raw, Nmin = grbbinomial_Pmin_raw(localProb, Ndraws)

    # Do a Monte-Carlo to determine significance
    if discreteness is None:
        trials = stats.uniform.rvs(size=(Nmc, Ndraws))
    else:
        integer_draws = stats.randint.rvs(
            0, discreteness + 1, size=(Nmc, Ndraws))
        trials = integer_draws / discreteness

    # keep the Ntail most significant values
    trials.sort(axis=1)
    kept = trials[:, :Ntail]

    PMC = special.bdtrc(np.arange(Ntail)[None, :], Ndraws, kept)
    at_least_as_extreme = PMC.min(axis=1) <= Pmin_raw
    return Pmin_raw, at_least_as_extreme.mean(), Nmin
def test_domain(self): val = sc.bdtrc(-1.1, 1, 0.5) val2 = sc.bdtrc(2.1, 1, 0.5) assert np.isnan(val2) assert_allclose(val, 1.0)
def test_bdtr_bdtrc_sum_to_one(self): bdtr_vals = sc.bdtr([0, 1, 2], 2, 0.5) bdtrc_vals = sc.bdtrc([0, 1, 2], 2, 0.5) vals = bdtr_vals + bdtrc_vals assert_allclose(vals, [1.0, 1.0, 1.0])
def calculateSignificant(outfilename, infilename, splineX, splineY,
                         possibleIntraInRangeCount, observedIntraInRangeSum,
                         possibleInterAllCount, observedInterAllSum,
                         lowThres, upThres, passNo, region='intraOnly'):
    """Compute p-values and q-values for every pair and report outliers.

    The counts table is loaded with ``read_countsFile(infilename, lowThres,
    upThres)``. Each row gets a binomial tail p-value
    ``bdtrc(contactCount - 1, N, prior_p)``:

    * ``intraInRange`` rows (unless the region is inter-only) use the spline
      value looked up at the row's distance as ``prior_p`` and
      ``observedIntraInRangeSum`` as ``N``;
    * ``intraShort`` / ``intraLong`` rows (unless inter-only) get 1.0;
    * remaining rows use the module-level ``interChrProb`` and
      ``observedInterAllSum`` when the region includes inter pairs,
      otherwise 1.0.

    q-values come from ``bh_correction`` with the number of valid pairs for
    the chosen region. The table, extended with ``p_vals`` and ``q_vals``
    columns, is written to ``<outfilename>.significant.gz`` (tab separated,
    gzip). Rows with a p-value below ``1 / totalValidCount`` are outliers.

    Parameters
    ----------
    outfilename : prefix of the output files.
    infilename : counts file handed to ``read_countsFile``.
    splineX, splineY : parallel sequences; ``splineX`` must be sorted because
        it is searched with ``bisect``.
    possibleIntraInRangeCount, possibleInterAllCount : numbers of possible
        pairs, used for the BH correction and the outlier threshold.
    observedIntraInRangeSum, observedInterAllSum : number of trials for the
        binomial tail of intra / inter pairs.
    lowThres, upThres : forwarded to ``read_countsFile``.
    passNo : not used by this function.
    region : forwarded to ``region_parser``.

    Returns
    -------
    (outliersline, outliersdist, FDRx, FDRy) : sorted row labels of the
        outliers, their sorted distances, and the two values returned by
        ``plot_qvalues``.
    """
    print("Calculating p-values and q-values for all pairs from input file...")
    print("--------------------------------------------------")
    newAllReads = read_countsFile(infilename, lowThres, upThres, silence=True)
    CCNT = newAllReads.contactCount.values
    DIST = newAllReads.distance.values
    ITYPE = newAllReads.contactType.values
    allReg, interOnly = region_parser(region, silence=True)

    # bisect_left returns len(splineX) for a distance larger than every
    # spline point; clamp to the last point so splineY[i] cannot go out of
    # range.
    lastSplineIdx = len(splineX) - 1

    p_vals = []
    for cc, dist, itype in zip(CCNT, DIST, ITYPE):
        bias1 = 1.0
        bias2 = 1.0  # not use bias file so far
        if (bias1 < 0 or bias2 < 0) and itype != 'inter':
            p_val = 1.0
        elif itype == 'intraInRange' and not interOnly:
            i = min(bisect.bisect_left(splineX, dist), lastSplineIdx)
            prior_p = splineY[i] * (bias1 * bias2)
            p_val = bdtrc(cc - 1, observedIntraInRangeSum, prior_p)
        elif itype in ('intraShort', 'intraLong') and not interOnly:
            p_val = 1.0
        elif allReg or interOnly:
            prior_p = interChrProb * (bias1 * bias2)
            p_val = bdtrc(cc - 1, observedInterAllSum, prior_p)
        else:
            p_val = 1.0
        p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        totalValidCount = possibleIntraInRangeCount + possibleInterAllCount
    elif interOnly:
        totalValidCount = possibleInterAllCount
    else:
        totalValidCount = possibleIntraInRangeCount
    outlierThres = 1.0 / totalValidCount
    q_vals = bh_correction(p_vals, totalValidCount)
    print('The calculation of p-values and q-values finished!')

    print(f'>>>> Writing to {outfilename}.significant.gz')
    newAllReads['p_vals'] = p_vals
    newAllReads['q_vals'] = q_vals
    newAllReads.to_csv(f'{outfilename}.significant.gz', sep='\t',
                       index=False, compression='gzip')
    print(f'>>>> p-vals and q-vals written to {outfilename}.significant.gz')

    # Find all outliers
    outlierReads = newAllReads[newAllReads.p_vals < outlierThres]
    outliersline = sorted(outlierReads.index.tolist())
    outliersdist = sorted(outlierReads.distance.tolist())
    print(f'Outlier threshold is: {outlierThres:.6e}')
    print(f'Found outlier pairs: {len(outliersline)}')

    print(f'>>>> Plotting q-values to file {outfilename}.qplot.svg')
    FDRx, FDRy = plot_qvalues(outfilename, q_vals, minFDR=0, maxFDR=0.05,
                              increment=1e-3)
    return outliersline, outliersdist, FDRx, FDRy
def test_inf(self, k, n, p): with suppress_warnings() as sup: sup.filter(DeprecationWarning) val = sc.bdtrc(k, n, p) assert np.isnan(val)
def test_legacy_cast():
    """A NaN ``k`` passed to bdtrc must give NaN back."""
    truncation_msg = "floating point number truncated to an integer"
    with suppress_warnings() as guard:
        guard.filter(RuntimeWarning, truncation_msg)
        outcome = sc.bdtrc(np.nan, 1, 0.5)
    assert_(np.isnan(outcome))
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname, passNo):
    """Fit a non-increasing contact-probability spline and score interactions.

    Steps performed by the body:

    1. Fit a ``UnivariateSpline`` to ``(x, y)`` with smoothing ``min(y)**2``.
    2. Evaluate it at the distinct integer distances found in
       ``sortedInteractions`` that lie within ``[min(x), max(x)]`` and make
       the values non-increasing with the R function ``monoreg``
       (``type="antitonic"``, called through ``ro``).
    3. Read the gzip file ``infilename`` (whitespace separated: chr1, mid1,
       chr2, mid2, count), compute one binomial tail p-value per line with
       ``scsp.bdtrc`` and q-values with
       ``myStats.benjamini_hochberg_correction``.
    4. Write ``<outdir>/<figname>.significances.txt.gz``.
    5. Mark every entry of ``sortedInteractions`` whose p-value is below
       ``1 / possibleIntraInRangeCount`` as an outlier.
    6. When the module flag ``visual`` is true, save diagnostic figures.

    Besides its arguments the function reads module-level names that are not
    defined in this block: ``baselineIntraChrProb``, ``baselineInterChrProb``,
    ``visual``, ``useInters``, ``distLowThres``, ``distUpThres``, ``toKb``,
    ``toProb``, ``outdir``, ``observedIntraInRangeSum``,
    ``observedIntraAllSum``, ``observedInterAllSum``,
    ``possibleIntraInRangeCount``, ``possibleIntraAllCount``,
    ``possibleInterAllCount`` and ``maxObservedGenomicDist``.

    Parameters
    ----------
    x, y, yerr : bin distances, mean probabilities and their errors. ``y``
        must support ``y - array`` (it is subtracted from the spline values).
    infilename : gzip-compressed interaction counts file.
    sortedInteractions : iterable of ``(distance, count, bias)`` triples.
    biasDic : ``{chromosome: {midpoint: bias}}``; may be empty.
    figname : base name of the output files inside ``outdir``.
    passNo : only used in plot labels.

    Returns
    -------
    list : ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``.

    NOTE(review): the files are opened with gzip modes 'r' / 'w' (binary),
    which matches Python 2 string handling; confirm the target interpreter
    before relying on this under Python 3.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write("------------------------------------------------------------------------------------\n")
    sys.stderr.write("baseline intra-chr probability: " + repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " + repr(baselineInterChrProb) + "\n")

    minX = min(x)
    maxX = max(x)

    # assume residualFactor==-1: the smoothing factor is min(y)^2
    splineError = min(y) * min(y)

    # use fitpack2 method - fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### This is done by calling the R function MONOREG, which does the
    ### isotonic regression with option antitonic so that the probabilities
    ### decrease monotonically with increasing genomic distance.
    distinctDists = sorted(set(int(i[0]) for i in sortedInteractions))
    ### Keep only distances within [min(x), max(x)], i.e. the range where
    ### the spline is defined.
    splineX = [d for d in distinctDists if minX <= d <= maxX]
    splineY = ius(splineX)

    # R vector format in, antitonic regression, then back to a Python list
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(ro.FloatVector(splineX), ro.FloatVector(splineY), type="antitonic")
    rNewSplineY = allRres[3]
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb), myUtils.scale_a_list(newSplineY, toProb),
                 'g-', label="spline-" + str(passNo), linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb), myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb), fmt='r.',
                     label="Mean with std. error", linewidth=2)

        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for _ in x], toProb),
                     'k-', label="Baseline intra-chromosomal")
            # The inter-chromosomal baseline must use the inter probability
            # (it previously re-plotted the intra baseline).
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineInterChrProb for _ in x], toProb),
                     'b-', label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)
        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for _ in x], 'k-')
            plt.loglog(x, [baselineInterChrProb for _ in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')
        plt.savefig(outdir + '/' + figname + '.png')

    # NOW compute the p-values, one per line of the input file
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " + repr(distUpThres) + "\n")
    p_vals = []
    infile = gzip.open(infilename, 'r')
    try:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(int(words[4]))

            # assumes there is no bias to begin with; if the biasDic is not
            # empty, look up the real bias values
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
                p_val = 1.0
            else:
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(interxn.distance, minX), maxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)  # biases added in the picture
                    intraInRangeCount += 1
                    ############# THIS HAS TO BE interactionCount-1 ##################
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                    intraVeryProximalCount += 1
                elif interactionType == 'intraLong':
                    # out of range, bigger than distUpThres
                    prior_p = 1.0
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum, prior_p)
                    intraOutOfRangeCount += 1
                elif useInters:
                    prior_p = baselineInterChrProb * (bias1 * bias2)  # biases added in the picture
                    ############# THIS HAS TO BE interactionCount-1 ##################
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum, prior_p)
                    interCount += 1
                else:
                    # Inter pair that is not being scored: record a neutral
                    # p-value so p_vals keeps exactly one entry per input
                    # line (the writing pass below indexes it by line number).
                    p_val = 1.0
            p_vals.append(p_val)
    finally:
        infile.close()

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) + "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount) + "\n")

    # Second pass over the input: write p-values and q-values next to the
    # interactions that belong to the analysed region.
    infile = gzip.open(infilename, 'r')
    outfile = gzip.open(outdir + '/' + figname + '.significances.txt.gz', 'w')
    sys.stderr.write("Writing p-values to file %s" % figname + ".significances.txt.gz\n")
    try:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if useInters == False and chrNo1 == chrNo2:  # intra
                interactionDistance = abs(midPoint1 - midPoint2)  # dist
                if myUtils.in_range_check(interactionDistance, distLowThres, distUpThres):
                    outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                                  (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                                   interactionCount, p_val, q_val))
            elif useInters == True and chrNo1 != chrNo2:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))
    finally:
        outfile.close()
        infile.close()

    # Outlier check for all interactions in sortedInteractions
    isOutlier = []
    distsBelow = []
    intcountsBelow = []
    intcountsAbove = []
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp = min(max(interactionDistance, minX), maxX)
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)  # biases added in the picture
        ############# THIS HAS TO BE interactionCount-1 ##################
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum, prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" % figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Sample sizes use floor division: random.sample needs an integer.
        downsample = 30  # for the non-outliers
        randIndcsAbove = sorted(sample(list(range(len(intcountsAbove))),
                                       len(intcountsAbove) // downsample))
        downsample = 20  # for the outliers
        randIndcsBelow = sorted(sample(list(range(len(intcountsBelow))),
                                       len(intcountsBelow) // downsample))

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow], toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.', label="Outliers (p-value < 1/M)")
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist], toKb),
                 [v * observedIntraInRangeSum for v in newSplineY]
                 + [newSplineY[-1] * observedIntraInRangeSum],
                 'g-', label="spline-" + str(passNo) + " (x N)", linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        # Trailing comma kept from the original statement.
        print(repr(len(intcountsBelow)) + "\t"),
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount) + "\tintraOutOfRangeCount " +
                     repr(intraOutOfRangeCount) + "\tintraVeryProximalCount " +
                     repr(intraVeryProximalCount) + "\tinterCount " + repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname + ".qplot.png\n")
    # FDRx / FDRy are part of the return value, so they are computed whether
    # or not plotting is enabled.
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment, figname + ".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy]  # from fit_Spline
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname, passNo):
    """Fit a non-increasing contact-probability spline and score interactions.

    Steps performed by the body:

    1. Fit a ``UnivariateSpline`` to ``(x, y)`` with smoothing ``min(y)**2``.
    2. Evaluate it at the distinct integer distances found in
       ``sortedInteractions`` that lie within ``[min(x), max(x)]`` and make
       the values non-increasing with the R function ``monoreg``
       (``type="antitonic"``, called through ``ro``).
    3. Read the gzip file ``infilename`` (whitespace separated: chr1, mid1,
       chr2, mid2, count), compute one binomial tail p-value per line with
       ``scsp.bdtrc`` and q-values with
       ``myStats.benjamini_hochberg_correction``.
    4. Write ``<outdir>/<figname>.significances.txt.gz``.
    5. Mark every entry of ``sortedInteractions`` whose p-value is below
       ``1 / possibleIntraInRangeCount`` as an outlier.
    6. When the module flag ``visual`` is true, save diagnostic figures.

    Besides its arguments the function reads module-level names that are not
    defined in this block: ``baselineIntraChrProb``, ``baselineInterChrProb``,
    ``visual``, ``useInters``, ``distLowThres``, ``distUpThres``, ``toKb``,
    ``toProb``, ``outdir``, ``observedIntraInRangeSum``,
    ``observedIntraAllSum``, ``observedInterAllSum``,
    ``possibleIntraInRangeCount``, ``possibleIntraAllCount``,
    ``possibleInterAllCount`` and ``maxObservedGenomicDist``.

    Parameters
    ----------
    x, y, yerr : bin distances, mean probabilities and their errors. ``y``
        must support ``y - array`` (it is subtracted from the spline values).
    infilename : gzip-compressed interaction counts file.
    sortedInteractions : iterable of ``(distance, count, bias)`` triples.
    biasDic : ``{chromosome: {midpoint: bias}}``; may be empty.
    figname : base name of the output files inside ``outdir``.
    passNo : only used in plot labels.

    Returns
    -------
    list : ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``.

    NOTE(review): the files are opened with gzip modes 'r' / 'w' (binary),
    which matches Python 2 string handling; confirm the target interpreter
    before relying on this under Python 3.
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write("------------------------------------------------------------------------------------\n")
    sys.stderr.write("baseline intra-chr probability: " + repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " + repr(baselineInterChrProb) + "\n")

    minX = min(x)
    maxX = max(x)

    # assume residualFactor==-1: the smoothing factor is min(y)^2
    splineError = min(y) * min(y)

    # use fitpack2 method - fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### This is done by calling the R function MONOREG, which does the
    ### isotonic regression with option antitonic so that the probabilities
    ### decrease monotonically with increasing genomic distance.
    distinctDists = sorted(set(int(i[0]) for i in sortedInteractions))
    ### Keep only distances within [min(x), max(x)], i.e. the range where
    ### the spline is defined.
    splineX = [d for d in distinctDists if minX <= d <= maxX]
    splineY = ius(splineX)

    # R vector format in, antitonic regression, then back to a Python list
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(ro.FloatVector(splineX), ro.FloatVector(splineY), type="antitonic")
    rNewSplineY = allRres[3]
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb), myUtils.scale_a_list(newSplineY, toProb),
                 'g-', label="spline-" + str(passNo), linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb), myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb), fmt='r.',
                     label="Mean with std. error", linewidth=2)

        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for _ in x], toProb),
                     'k-', label="Baseline intra-chromosomal")
            # The inter-chromosomal baseline must use the inter probability
            # (it previously re-plotted the intra baseline).
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineInterChrProb for _ in x], toProb),
                     'b-', label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)
        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for _ in x], 'k-')
            plt.loglog(x, [baselineInterChrProb for _ in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')
        plt.savefig(outdir + '/' + figname + '.png')

    # NOW compute the p-values, one per line of the input file
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " + repr(distUpThres) + "\n")
    p_vals = []
    infile = gzip.open(infilename, 'r')
    try:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(int(words[4]))

            # assumes there is no bias to begin with; if the biasDic is not
            # empty, look up the real bias values
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
                p_val = 1.0
            else:
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(interxn.distance, minX), maxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)  # biases added in the picture
                    intraInRangeCount += 1
                    ############# THIS HAS TO BE interactionCount-1 ##################
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                    intraVeryProximalCount += 1
                elif interactionType == 'intraLong':
                    # out of range, bigger than distUpThres
                    prior_p = 1.0
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum, prior_p)
                    intraOutOfRangeCount += 1
                elif useInters:
                    prior_p = baselineInterChrProb * (bias1 * bias2)  # biases added in the picture
                    ############# THIS HAS TO BE interactionCount-1 ##################
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum, prior_p)
                    interCount += 1
                else:
                    # Inter pair that is not being scored: record a neutral
                    # p-value so p_vals keeps exactly one entry per input
                    # line (the writing pass below indexes it by line number).
                    p_val = 1.0
            p_vals.append(p_val)
    finally:
        infile.close()

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) + "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount) + "\n")

    # Second pass over the input: write p-values and q-values next to the
    # interactions that belong to the analysed region.
    infile = gzip.open(infilename, 'r')
    outfile = gzip.open(outdir + '/' + figname + '.significances.txt.gz', 'w')
    sys.stderr.write("Writing p-values to file %s" % figname + ".significances.txt.gz\n")
    try:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]

            if useInters == False and chrNo1 == chrNo2:  # intra
                interactionDistance = abs(midPoint1 - midPoint2)  # dist
                if myUtils.in_range_check(interactionDistance, distLowThres, distUpThres):
                    outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                                  (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                                   interactionCount, p_val, q_val))
            elif useInters == True and chrNo1 != chrNo2:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))
    finally:
        outfile.close()
        infile.close()

    # Outlier check for all interactions in sortedInteractions
    isOutlier = []
    distsBelow = []
    intcountsBelow = []
    intcountsAbove = []
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp = min(max(interactionDistance, minX), maxX)
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)  # biases added in the picture
        ############# THIS HAS TO BE interactionCount-1 ##################
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum, prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
        else:
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" % figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # Sample sizes use floor division: random.sample needs an integer.
        downsample = 30  # for the non-outliers
        randIndcsAbove = sorted(sample(list(range(len(intcountsAbove))),
                                       len(intcountsAbove) // downsample))
        downsample = 20  # for the outliers
        randIndcsBelow = sorted(sample(list(range(len(intcountsBelow))),
                                       len(intcountsBelow) // downsample))

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow], toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.', label="Outliers (p-value < 1/M)")
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist], toKb),
                 [v * observedIntraInRangeSum for v in newSplineY]
                 + [newSplineY[-1] * observedIntraInRangeSum],
                 'g-', label="spline-" + str(passNo) + " (x N)", linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        # Trailing comma kept from the original statement.
        print(repr(len(intcountsBelow)) + "\t"),
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount) + "\tintraOutOfRangeCount " +
                     repr(intraOutOfRangeCount) + "\tintraVeryProximalCount " +
                     repr(intraVeryProximalCount) + "\tinterCount " + repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname + ".qplot.png\n")
    # FDRx / FDRy are part of the return value, so they are computed whether
    # or not plotting is enabled.
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment, figname + ".qplot")

    return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy]  # from fit_Spline
def _lookup_bias(biasDic, chrom, mid):
    """Return the bias stored for (chrom, mid), or -1 with a printed notice.

    -1 is returned when either the chromosome or the midpoint is missing
    from ``biasDic``; the caller treats negative biases as "discard".
    """
    if chrom not in biasDic:
        print("Warning. Bias file does not contain chromosome %s. "
              "Please ensure you're using correct file. "
              "Fit-Hi-C will continue with bias = -1 for this locus" % chrom)
        return -1
    if mid not in biasDic[chrom]:
        print("Error. Bias file does not contain midpoint %s within %s. "
              "Please ensure you're using the correct file and/or resolution argument. "
              "Fit-Hi-C will continue with bias = -1 for this locus" % (mid, chrom))
        return -1
    return biasDic[chrom][mid]


def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit the distance/probability spline and write p- and q-values.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    (smoothing ``s = min(y)**2``) is fitted to ``(x, y)``, evaluated on the
    keys of ``mainDic`` that fall inside ``[min(x), max(x)]`` and made
    non-increasing with ``IsotonicRegression``.  Every line of the gzip text
    file ``infilename`` (``chr1 mid1 chr2 mid2 count``) then gets a binomial
    tail p-value ``scsp.bdtrc(count - 1, N, prior)``; p-values are corrected
    with ``myStats.benjamini_hochberg_correction`` and written to
    ``outfilename[.res<resolution>].significances.txt.gz``.

    Settings such as ``logfile``, ``interOnly``, ``allReg``, ``visual``,
    ``distLowThres``, ``distUpThres``, ``interChrProb``, ``toKb`` and
    ``toProb`` are read from names defined outside this function.

    Parameters:
        mainDic: mapping whose keys are the distances the spline is
            evaluated at.
        x, y: bin distances and bin probabilities; when ``outliersdist`` is
            not None, ``x`` is sorted in place and ``y`` reordered to match.
        yerr: error bars, used only for plotting.
        infilename: gzip-compressed interactions file.
        outfilename: prefix of the plot and significance files.
        biasDic: ``{chromosome: {midpoint: bias}}``; falsy means no biases.
        outliersline, outliersdist: sets that receive the line index and the
            distance of every interaction whose p-value is below
            ``1 / <number of tests>``.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to
            ``bdtrc`` for in-range intra and inter interactions.
        possibleIntraInRangeCount, possibleInterAllCount: numbers of tests
            used for the correction and the outlier threshold.
        observedIntraAllSum: not used by this function.
        resolution: if truthy, embedded in the output file name.
        passNo: label of the fitting pass shown in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``; the spline entries stay None when ``interOnly`` is set and
        the FDR entries stay None unless ``visual`` is True.

    Exits the process with status 2 if ``x`` is not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            # keep y aligned with x while x is put into ascending order
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        # the spline fit needs strictly increasing x
        for prevDist, curDist in zip(x, x[1:]):
            if curDist <= prevDist:
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % prevDist)
                print("Avg. distance of bin(i)... %s" % curDist)
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX = max(x)
        tempMinX = min(x)
        # only evaluate the spline inside [min(x), max(x)], where it is defined
        splineX = [dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX]
        splineY = ius(splineX)

        # force the fitted probabilities to be non-increasing with distance
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([i * i for i in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-', label="spline-" + str(passNo), linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.', label="Mean with std. error", linewidth=2)
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)
            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')
            plt.savefig(outfilename + '.png')

    # First pass: compute one p-value (and remember both biases) per input line
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres, distUpThres)

            # assume no bias unless a bias dictionary was supplied
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                bias1 = _lookup_bias(biasDic, ch1, mid1)
                bias2 = _lookup_bias(biasDic, ch2, mid2)
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # intra-chromosomal pair with an unusable bias: not tested
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # clamp the distance to the range covered by the spline
                distToLookUp = min(max(interxn.getDistance(), tempMinX), tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedIntraInRangeSum, prior_p)
            elif interactionType in ('intraShort', 'intraLong') and not interOnly:
                # out-of-range intra pairs are not tested
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedInterAllSum, prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction; the number of tests depends on the regions analysed
    if allReg:
        testCount = possibleIntraInRangeCount + possibleInterAllCount
    elif interOnly:
        testCount = possibleInterAllCount
    else:
        testCount = possibleIntraInRangeCount
    outlierThres = 1.0 / testCount
    q_vals = myStats.benjamini_hochberg_correction(p_vals, testCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: write the values back next to the original columns
    if resolution:
        sigPath = outfilename + '.res' + str(resolution) + '.significances.txt.gz'
    else:
        sigPath = outfilename + '.significances.txt.gz'
    rowFormat = "%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, gzip.open(sigPath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" % (outfilename + ".significances.txt"))
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            row = (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount,
                   p_val, q_vals[count], biasl[count], biasr[count])

            if (allReg or interOnly) and chr1 != chr2:
                outfile.write(rowFormat % row)
            if (allReg or not interOnly) and chr1 == chr2:
                interactionDistance = abs(midPoint1 - midPoint2)
                if myUtils.in_range_check(interactionDistance, distLowThres, distUpThres):
                    outfile.write(rowFormat % row)

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1 - midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment, outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy]  # from fit_Spline
def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic):
    """Fit the distance/probability spline and write p- and q-values.

    A ``UnivariateSpline`` (smoothing ``s = min(y)**2``) is fitted to
    ``(x, y)``, evaluated on the keys of ``mainDic`` that fall inside
    ``[min(x), max(x)]`` and made non-increasing with the R function
    ``monoreg`` (``type="antitonic"``) called through ``ro``.  The fit is
    plotted to ``outfilename + '.res<resolution>.png'``.  Every line of the
    gzip text file ``infilename`` (``chr1 mid1 chr2 mid2 count``) then gets
    a binomial tail p-value ``scsp.bdtrc(count - 1, N, prior)``; p-values
    are corrected with ``myStats.benjamini_hochberg_correction`` and written
    to ``outfilename + '.res<resolution>.significances.txt.gz'``.

    ``resolution``, ``distLowThres``, ``distUpThres``, ``interChrProb``,
    ``baselineIntraChrProb``, the ``observed*Sum`` totals and the
    ``possible*Count`` totals are read from names defined outside this
    function.

    Parameters:
        mainDic: mapping whose keys are the distances the spline is
            evaluated at.
        x, y: bin distances and bin probabilities.
        yerr: not used by this function.
        infilename: gzip-compressed interactions file.
        outfilename: prefix of the plot and significance files.
        biasDic: ``{chromosome: {midpoint: bias}}``; loci that are missing
            keep a bias of 1.0, a bias of -1 marks the pair as discarded.

    Returns:
        ``[splineX, newSplineY, residual]``.
    """
    print("\nFit a univariate spline to the probability means\n")
    print("------------------------------------------------------------------------------------\n")

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y) * min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # only evaluate the spline inside [min(x), max(x)], where it is defined
    tempMaxX = max(x)
    tempMinX = min(x)
    splineX = [dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX]
    splineY = ius(splineX)

    # Post-process the spline with R's monoreg (antitonic) so the contact
    # probabilities are non-increasing with genomic distance
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title('Univariate spline fit to the output of equal occupancy binning. \n Residual= %e' % (residual), size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y], 'ro', label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY], 'g-', label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([tempMinX / 1000.0, tempMaxX / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([tempMinX, tempMaxX])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    print("lower bound on mid-range distances " + repr(distLowThres) + ", upper bound on mid-range distances " + repr(distUpThres) + "\n")

    # First pass: one p-value per input line.  Text mode is required so that
    # chromosome names are str and match the keys of biasDic.
    p_vals = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(float(words[4]))

            # assume no bias unless the bias dictionary has an entry
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if bias1 == -1 or bias2 == -1:
                p_val = 1.0
            elif interxn.type == 'intra':
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(interxn.distance, tempMinX), tempMaxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                elif interactionType == 'intraLong':
                    # out of range distance: use the baseline intra-chr probability
                    prior_p = baselineIntraChrProb * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum, prior_p)
                # NOTE(review): any other type leaves p_val at its previous
                # value, exactly as before this rewrite.
            else:  # inter
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum, prior_p)
            p_vals.append(p_val)

    # Do the BH FDR correction
    q_vals = myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount + possibleIntraAllCount)

    # Second pass: write every interaction together with its p- and q-value
    sigPath = outfilename + '.res' + str(resolution) + '.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, gzip.open(sigPath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" % outfilename + ".significances.txt\n")
        # NOTE(review): this message announces a number but prints none.
        print("Number of pairs discarded due to bias not in range [0.5 2]\n")
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1), midPoint1, str(chrNo2), midPoint2, interactionCount, p_vals[count], q_vals[count]))

    return [splineX, newSplineY, residual]  # from fit_Spline
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic):
    """Fit the distance/probability spline and write p- and q-values.

    Steps, in order:

    1. Fit a ``UnivariateSpline`` with smoothing ``s = min(y)**2`` to
       ``(x, y)`` and evaluate it on the keys of ``mainDic`` that lie in
       ``[min(x), max(x)]``.
    2. Make the fitted values non-increasing with the R function
       ``monoreg`` (``type="antitonic"``), called through ``ro``.
    3. Plot the fit to ``outfilename + '.res<resolution>.png'``.
    4. Read the gzip text file ``infilename`` (``chr1 mid1 chr2 mid2
       count`` per line) and compute a binomial tail p-value
       ``scsp.bdtrc(count - 1, N, prior)`` for each line.
    5. Correct the p-values with
       ``myStats.benjamini_hochberg_correction`` and write everything to
       ``outfilename + '.res<resolution>.significances.txt.gz'``.

    ``resolution``, ``distLowThres``, ``distUpThres``, ``interChrProb``,
    ``baselineIntraChrProb``, the ``observed*Sum`` totals and the
    ``possible*Count`` totals come from names defined outside this function.

    Parameters:
        mainDic: mapping whose keys are the distances to evaluate.
        x, y: bin distances and bin probabilities.
        yerr: not used by this function.
        infilename: gzip-compressed interactions file.
        outfilename: prefix of the plot and significance files.
        biasDic: ``{chromosome: {midpoint: bias}}``; missing loci keep a
            bias of 1.0 and a bias of -1 marks the pair as discarded.

    Returns:
        ``[splineX, newSplineY, residual]``.
    """
    print("\nFit a univariate spline to the probability means\n")
    print(
        "------------------------------------------------------------------------------------\n"
    )

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y) * min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # restrict the evaluation points to the range where the spline is defined
    tempMaxX = max(x)
    tempMinX = min(x)
    splineX = [dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX]
    splineY = ius(splineX)

    # antitonic regression in R keeps probabilities from rising with distance
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = [rNewSplineY[i] for i in range(len(rNewSplineY))]

    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([tempMinX / 1000.0, tempMaxX / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([tempMinX, tempMaxX])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    print("lower bound on mid-range distances " + repr(distLowThres) +
          ", upper bound on mid-range distances " + repr(distUpThres) + "\n")

    # First pass: one p-value per input line.  The file is opened in text
    # mode so chromosome names are str and match the keys of biasDic.
    p_vals = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            words = line.rstrip().split()
            chr1 = words[0]
            chr2 = words[2]
            midPoint1 = int(words[1])
            midPoint2 = int(words[3])
            interxn = myUtils.Interaction([chr1, midPoint1, chr2, midPoint2])
            interxn.setCount(float(words[4]))

            # assume no bias unless the bias dictionary has an entry
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][midPoint1]
                if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][midPoint2]

            if bias1 == -1 or bias2 == -1:
                p_val = 1.0
            elif interxn.type == 'intra':
                interactionType = interxn.getType(distLowThres, distUpThres)
                if interactionType == 'intraInRange':
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(interxn.distance, tempMinX),
                                       tempMaxX)
                    i = min(bisect.bisect_left(splineX, distToLookUp),
                            len(splineX) - 1)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraInRangeSum, prior_p)
                elif interactionType == 'intraShort':
                    p_val = 1.0
                elif interactionType == 'intraLong':
                    # out of range distance: use the baseline intra-chr probability
                    prior_p = baselineIntraChrProb * (bias1 * bias2)
                    p_val = scsp.bdtrc(interxn.hitCount - 1,
                                       observedIntraAllSum, prior_p)
                # NOTE(review): any other type leaves p_val at its previous
                # value, exactly as before this rewrite.
            else:  # inter
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum,
                                   prior_p)
            p_vals.append(p_val)

    # Do the BH FDR correction
    q_vals = myStats.benjamini_hochberg_correction(
        p_vals, possibleInterAllCount + possibleIntraAllCount)

    # Second pass: write every interaction together with its p- and q-value
    sigPath = outfilename + '.res' + str(resolution) + '.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sigPath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" % outfilename +
              ".significances.txt\n")
        # NOTE(review): this message announces a number but prints none.
        print("Number of pairs discarded due to bias not in range [0.5 2]\n")
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chrNo1 = words[0]
            midPoint1 = int(words[1])
            chrNo2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = int(words[4])
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                          (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                           interactionCount, p_vals[count], q_vals[count]))

    return [splineX, newSplineY, residual]  # from fit_Spline
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               resolution, min_dist, max_dist, verbose):
    """Fit the distance/probability spline and write in-range p-values.

    A ``UnivariateSpline`` (smoothing ``s = min(y)**2``) is fitted to
    ``(x, y)``, evaluated on the keys of ``mainDic`` inside
    ``[min(x), max(x)]`` and made non-increasing with
    ``IsotonicRegression``.  The fit is plotted to
    ``outfilename + '.res<resolution>.png'``.  For every line of the gzip
    text file ``infilename`` (``chr1 mid1 chr2 mid2 count``) whose
    ``mid2 - mid1`` lies in ``[min_dist, max_dist]``, the binomial tail
    p-value ``scsp.bdtrc(count - 1, observedIntraInRangeSum, prior)`` is
    written to ``<outfilename>.res<resolution>.significances.txt.gz``; the
    q-value column is always written as -1.

    ``observedIntraInRangeSum`` is read from a name defined outside this
    function.

    Parameters:
        mainDic: mapping whose keys are the distances to evaluate.
        x, y: bin distances and bin probabilities.
        yerr: not used by this function.
        infilename: gzip-compressed interactions file.
        outfilename: prefix of the plot and significance files.
        biasDic: ``{chromosome: {midpoint: bias}}``; missing loci keep 1.0.
        resolution: embedded in both output file names.
        min_dist, max_dist: inclusive bounds on ``mid2 - mid1``.
        verbose: if truthy, print progress messages.

    Returns:
        The tuple ``(splineX, newSplineY, residual)``.
    """
    if verbose:
        print("\nFit a univariate spline to the probability means\n")
        print(
            "------------------------------------------------------------------------------------\n"
        )

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y)**2

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # restrict the evaluation points to the range where the spline is defined
    min_x, max_x = min(x), max(x)
    splineX = [dis for dis in sorted(mainDic) if min_x <= dis <= max_x]
    splineY = ius(splineX)

    # isotonic regression keeps probabilities from rising with distance
    ir = IsotonicRegression(increasing=False)
    newSplineY = list(ir.fit_transform(splineX, splineY))

    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x / 1000.0, max_x / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    if verbose:
        print("lower bound on mid-range distances " + repr(min_dist) +
              ", upper bound on mid-range distances " + repr(max_dist) + "\n")

    # Text mode on both handles: chromosome names must be str to match the
    # keys of biasDic, and the rows written below are str.
    sig_path = '{}.res{}.significances.txt.gz'.format(outfilename, resolution)
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sig_path, 'wt') as outfile:
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )
        for line in infile:
            chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
            mid1, mid2, contactCount = int(mid1), int(mid2), int(contactCount)
            distance = mid2 - mid1

            # assume no bias unless the bias dictionary has an entry
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and mid1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][mid1]
                if chr2 in biasDic and mid2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][mid2]

            if min_dist <= distance <= max_dist:
                # make sure the interaction distance is covered by the probability bins
                distToLookUp = min(max(distance, min_x), max_x)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(contactCount - 1, observedIntraInRangeSum,
                                   prior_p)
                # a NaN p-value fails this comparison and the row is skipped
                if p_val <= 1:
                    outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                        chr1, mid1, chr2, mid2, contactCount, p_val, -1))

    return splineX, newSplineY, residual
def test_value(self): val = sc.bdtrc(0, 1, 0.5) assert_allclose(val, 0.5)
def _sf(self, x, n, p): k = floor(x) return special.bdtrc(k, n, p)
def test_sum_is_one(self): val = sc.bdtrc([0, 1, 2], 2, 0.5) assert_array_equal(val, [0.75, 0.25, 0.0])
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit the distance/probability spline and write p- and q-values.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    (smoothing ``s = min(y)**2``) is fitted to ``(x, y)``, evaluated on the
    keys of ``mainDic`` that fall inside ``[min(x), max(x)]`` and made
    non-increasing with ``IsotonicRegression``.  Every line of the gzip text
    file ``infilename`` (``chr1 mid1 chr2 mid2 count``) then gets a binomial
    tail p-value ``scsp.bdtrc(count - 1, N, prior)``; p-values are corrected
    with ``myStats.benjamini_hochberg_correction`` and written to
    ``outfilename[.res<resolution>].significances.txt.gz``.

    Settings such as ``logfile``, ``interOnly``, ``allReg``, ``visual``,
    ``distLowThres``, ``distUpThres``, ``interChrProb``, ``toKb`` and
    ``toProb`` are read from names defined outside this function.

    Parameters:
        mainDic: mapping whose keys are the distances the spline is
            evaluated at.
        x, y: bin distances and bin probabilities; when ``outliersdist`` is
            not None, ``x`` is sorted in place and ``y`` reordered to match.
        yerr: error bars, used only for plotting.
        infilename: gzip-compressed interactions file.
        outfilename: prefix of the plot and significance files.
        biasDic: ``{chromosome: {midpoint: bias}}``; loci that are missing
            keep a bias of 1.0, negative biases discard intra pairs.
        outliersline, outliersdist: sets that receive the line index and the
            distance of every interaction whose p-value is below
            ``1 / <number of tests>``.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to
            ``bdtrc`` for in-range intra and inter interactions.
        possibleIntraInRangeCount, possibleInterAllCount: numbers of tests
            used for the correction and the outlier threshold.
        observedIntraAllSum: not used by this function.
        resolution: if truthy, embedded in the output file name.
        passNo: label of the fitting pass shown in the plot legend.

    Returns:
        ``[splineX, newSplineY, residual, outliersline, outliersdist, FDRx,
        FDRy]``; the spline entries stay None when ``interOnly`` is set and
        the FDR entries stay None unless ``visual`` is True.

    Exits the process with status 2 if ``x`` is not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            # keep y aligned with x while x is put into ascending order
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        # the spline fit needs strictly increasing x
        for prevDist, curDist in zip(x, x[1:]):
            if curDist <= prevDist:
                print(
                    "ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct."
                )
                print("Avg. distance of bin(i-1)... %s" % prevDist)
                print("Avg. distance of bin(i)... %s" % curDist)
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX = max(x)
        tempMinX = min(x)
        # only evaluate the spline inside [min(x), max(x)], where it is defined
        splineX = [
            dis for dis in sorted(mainDic) if tempMinX <= dis <= tempMaxX
        ]
        splineY = ius(splineX)

        # force the fitted probabilities to be non-increasing with distance
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([i * i for i in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-',
                     label="spline-" + str(passNo),
                     linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.',
                         label="Mean with std. error",
                         linewidth=2)
            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(
                    myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)
            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')
            plt.savefig(outfilename + '.png')

    # First pass: compute one p-value (and remember both biases) per input line
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres, distUpThres)

            # assume no bias unless the bias dictionary has an entry
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # intra-chromosomal pair with an unusable bias: not tested
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # clamp the distance to the range covered by the spline
                distToLookUp = min(max(interxn.getDistance(), tempMinX),
                                   tempMaxX)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif interactionType in ('intraShort',
                                     'intraLong') and not interOnly:
                # out-of-range intra pairs are not tested
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedInterAllSum,
                                   prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction; the number of tests depends on the regions analysed
    if allReg:
        testCount = possibleIntraInRangeCount + possibleInterAllCount
    elif interOnly:
        testCount = possibleInterAllCount
    else:
        testCount = possibleIntraInRangeCount
    outlierThres = 1.0 / testCount
    q_vals = myStats.benjamini_hochberg_correction(p_vals, testCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: write the values back next to the original columns
    if resolution:
        sigPath = (outfilename + '.res' + str(resolution) +
                   '.significances.txt.gz')
    else:
        sigPath = outfilename + '.significances.txt.gz'
    rowFormat = "%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(sigPath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" %
              (outfilename + ".significances.txt"))
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            row = (str(chr1), midPoint1, str(chr2), midPoint2,
                   interactionCount, p_val, q_vals[count], biasl[count],
                   biasr[count])

            if (allReg or interOnly) and chr1 != chr2:
                outfile.write(rowFormat % row)
            if (allReg or not interOnly) and chr1 == chr2:
                interactionDistance = abs(midPoint1 - midPoint2)
                if myUtils.in_range_check(interactionDistance, distLowThres,
                                          distUpThres):
                    outfile.write(rowFormat % row)

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1 - midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
        minFDR = 0.0
        maxFDR = 0.05
        increment = 0.001
        FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                                  outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [
        splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy
    ]  # from fit_Spline
def test_rounding(self): double_val = sc.bdtrc([0.1, 1.1, 2.1], 2, 0.5) int_val = sc.bdtrc([0, 1, 2], 2, 0.5) assert_array_equal(double_val, int_val)
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               resolution, min_dist, max_dist):
    """Fit a non-increasing spline to binned contact probabilities and score contacts.

    The function
      1. fits a ``UnivariateSpline`` to ``(x, y)`` with smoothing ``min(y)**2``,
      2. evaluates it at every key of ``mainDic`` lying in ``[min(x), max(x)]``
         and forces the result to be non-increasing with an antitonic
         ``IsotonicRegression``,
      3. saves a two-panel plot to ``<outfilename>.res<resolution>.png``,
      4. computes a binomial tail p-value (``scsp.bdtrc``) for every line of
         the gzipped contact file ``infilename``, applies
         ``benjamini_hochberg_correction`` and writes the result to
         ``<outfilename>.res<resolution>.significances.txt.gz``.

    Parameters:
      mainDic     -- mapping whose keys are genomic distances; only the keys
                     are used, as evaluation points of the spline.
      x, y        -- binned distances and mean contact probabilities. ``y``
                     must support ``y - ius(x)`` (e.g. a numpy array).
      yerr        -- unused; kept for interface compatibility.
      infilename  -- gzipped text file with five whitespace-separated fields
                     per line: chr1, mid1, chr2, mid2, contactCount.
      outfilename -- prefix for the plot and the significance file.
      biasDic     -- ``{chr: {mid: bias}}``; may be empty (all biases 1.0).
                     A bias of -1 marks the pair as discarded (p-value 1.0).
      resolution  -- embedded in the output file names via ``str()``.
      min_dist, max_dist -- distance bounds of the mid-range; -1 disables
                     the respective bound.

    Module-level names read here (defined elsewhere in the file):
    ``observedIntraInRangeSum``, ``observedIntraAllSum``,
    ``observedInterAllSum``, ``baselineIntraChrProb``, ``interChrProb``,
    ``possibleInterAllCount``, ``possibleIntraAllCount``.

    Returns:
      ``(splineX, newSplineY, residual)`` -- evaluation distances, the
      monotone probabilities at those distances, and the sum of squared
      residuals of the raw spline at ``x``.
    """
    print("\nFit a univariate spline to the probability means\n")
    print("------------------------------------------------------------------------------------\n")

    # Maximum residual allowed for the spline is set to min(y)^2.
    splineError = min(y) ** 2
    ius = UnivariateSpline(x, y, s=splineError)

    # Evaluate only inside [min(x), max(x)], where the spline is defined.
    min_x, max_x = min(x), max(x)
    splineX = [dist for dist in sorted(mainDic) if min_x <= dist <= max_x]
    splineY = ius(splineX)

    # Post-process with an antitonic regression so that the probabilities
    # never increase with genomic distance.
    ir = IsotonicRegression(increasing=False)
    newSplineY = list(ir.fit_transform(splineX, splineY))

    residual = sum(r * r for r in (y - ius(x)))

    # ---- plot the fit ----
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title('Univariate spline fit to the output of equal occupancy binning. \n Residual= %e' % (residual), size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y], 'ro', label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY], 'g-', label="Spline fit")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x / 1000.0, max_x / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")
    # Release the figure; pyplot otherwise keeps it alive across calls.
    plt.close(fig)

    # ---- compute the p-values ----
    discardCount = 0
    print("lower bound on mid-range distances " + repr(min_dist) +
          ", upper bound on mid-range distances " + repr(max_dist) + "\n")
    p_vals = []
    last_idx = len(splineX) - 1
    # Text mode: in binary mode the parsed fields would be bytes and would
    # never match the str keys of biasDic on Python 3.
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
            mid1 = int(mid1)
            mid2 = int(mid2)
            contactCount = int(contactCount)
            # abs(): a pair listed with mid1 > mid2 must not be treated as
            # a negative (i.e. "too close") distance.
            distance = abs(mid2 - mid1)

            # Assume no bias unless biasDic provides a value for the locus.
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and mid1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][mid1]
                if chr2 in biasDic and mid2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][mid2]

            if bias1 == -1 or bias2 == -1:
                p_val = 1.0
                discardCount += 1
            elif chr1 == chr2:
                if (min_dist == -1 or (min_dist > -1 and distance > min_dist)) and \
                        (max_dist == -1 or (max_dist > -1 and distance <= max_dist)):
                    # Clamp so the distance is covered by the probability bins.
                    distToLookUp = min(max(distance, min_x), max_x)
                    i = min(bisect.bisect_left(splineX, distToLookUp), last_idx)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    # contactCount - 1 because bdtrc sums terms k+1..n.
                    p_val = scsp.bdtrc(contactCount - 1, observedIntraInRangeSum, prior_p)
                elif min_dist > -1 and distance <= min_dist:
                    # Very proximal pairs are never called significant.
                    p_val = 1.0
                elif max_dist > -1 and distance > max_dist:
                    # Out of range: use the baseline intra-chromosomal prior.
                    prior_p = baselineIntraChrProb * (bias1 * bias2)
                    p_val = scsp.bdtrc(contactCount - 1, observedIntraAllSum, prior_p)
            else:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(contactCount - 1, observedInterAllSum, prior_p)
            p_vals.append(p_val)

    # ---- Benjamini-Hochberg FDR correction ----
    q_vals = benjamini_hochberg_correction(p_vals, possibleInterAllCount + possibleIntraAllCount)

    print("Writing p-values and q-values to file %s" % outfilename + ".significances.txt\n")
    print("Number of pairs discarded due to bias not in range [0.5 2]: %d\n" % discardCount)

    # ---- write the contacts back out with their p- and q-values ----
    sig_path = outfilename + '.res' + str(resolution) + '.significances.txt.gz'
    # 'wt': writing str to a binary-mode gzip file raises TypeError on Python 3.
    with gzip.open(infilename, 'rt') as infile, gzip.open(sig_path, 'wt') as outfile:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for line, p_val, q_val in zip(infile, p_vals, q_vals):
            chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
            outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                chr1, mid1, chr2, mid2, contactCount, p_val, q_val))

    return splineX, newSplineY, residual