def Glower(theta, y, n, j):
    """Evaluate the recursively defined tail quantity G_j.

    With X_i ~ Binomial(n[i], theta) the recursion is

        G_0 = P(X_0 >= y[0])
        G_i = P(X_i > y[i]) + P(X_i == y[i]) * G_{i-1}

    The recursion is unrolled into a loop so that no array copies are made
    per level and large ``j`` does not hit the interpreter recursion limit.

    Parameters
    ----------
    theta : float
        Success probability shared by all stages.
    y : array-like
        Observed counts per stage (indexable up to ``j``).
    n : array-like
        Number of trials per stage (indexable up to ``j``).
    j : int
        Index of the last stage to include; must be non-negative.

    Returns
    -------
    float
        The value G_j.

    Raises
    ------
    ValueError
        If ``j`` is negative (the recursion has no base case below 0).
    """
    if j < 0:
        raise ValueError("j must be a non-negative index")
    y = np.asarray(y)
    n = np.asarray(n)
    # Base case: sf(y - 1) is P(X >= y).
    result = binom.sf(k=y[0] - 1, n=n[0], p=theta)
    for i in range(1, j + 1):
        result = (binom.sf(k=y[i], n=n[i], p=theta)
                  + binom.pmf(k=y[i], n=n[i], p=theta) * result)
    return result
def binom_test_v2(x, n=None, p=0.5, alternative='two-sided'):
    """Exact binomial test: p-value for observing ``x`` successes in ``n`` trials.

    Parameters
    ----------
    x : int
        Observed number of successes.
    n : int
        Number of trials (converted with ``np.int_``).
    p : float
        Hypothesised success probability, must lie in [0, 1].
    alternative : {'two-sided', 'less', 'greater'}
        Alternative hypothesis.

    Returns
    -------
    float
        The p-value; the two-sided result is capped at 1.0.

    Raises
    ------
    ValueError
        If ``p`` is outside [0, 1] or ``alternative`` is not recognised.
    """
    n = np.int_(n)

    if (p > 1.0) or (p < 0.0):
        raise ValueError("p must be in range [0,1]")

    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError(
            "alternative not recognized should be 'two-sided', 'less' or 'greater'"
        )

    # One-sided alternatives are plain tail probabilities.
    if alternative == 'less':
        return binom.cdf(x, n, p)
    if alternative == 'greater':
        return binom.sf(x - 1, n, p)

    # Two-sided: add the probability of the opposite tail whose outcomes are
    # no more likely than the observed one (rerr absorbs rounding noise).
    observed_mass = binom.pmf(x, n, p)
    rerr = 1 + 1e-7

    def mass_at(x1):
        return binom.pmf(x1, n, p)

    if x == p * n:
        # Observation sits exactly on the mean.
        two_sided = 1.
    elif x < p * n:
        # NOTE(review): binary_search is defined elsewhere; presumably it
        # locates the boundary index in the upper tail -- confirm.
        y = n - binary_search(mass_at, observed_mass * rerr, np.ceil(p * n), n) + 1
        two_sided = (binom.cdf(x, n, p) + binom.sf(n - y, n, p))
    else:
        y = binary_search(mass_at, observed_mass * rerr, 0, np.floor(p * n) + 1, True) + 1
        two_sided = (binom.cdf(y - 1, n, p) + binom.sf(x - 1, n, p))

    return min(1.0, two_sided)
def alpha_on_determinist_compound_closed_form(lmb=10.0, t1=10,
                                              t2=10, l=3, verbose=False):
    """Sum Poisson-weighted binomial tail probabilities over a grid of alpha levels.

    For each total count ``k`` the binomial inverse survival function on
    ``k * l`` trials is evaluated at every nominal level, divided by ``l``,
    truncated to int and fed to the binomial survival function on ``k``
    trials. The result is weighted by the Poisson(lmb * (t1 + t2)) mass of
    ``k``. The sum runs upward from ``int(lmb * (t1 + t2))`` until the
    increments become negligible, then downward to ``k = 0``.

    Parameters
    ----------
    lmb : float
        Rate; the Poisson mean is ``lmb * (t1 + t2)``.
    t1, t2 : numbers
        Durations; the binomial probability is ``t2 / (t1 + t2)``.
    l : int
        Multiplier applied to the trial count.
    verbose : bool
        Print progress information for the upward sweep.

    Returns
    -------
    alphas : np.ndarray
        Accumulated values, one per nominal level.
    alpha_hats : np.ndarray
        The nominal levels ``np.arange(0.00001, 1.0, 0.01)``.
    total_pois_mass : float
        Total Poisson mass covered by the two sweeps.
    """
    alpha_hats = np.arange(0.00001, 1.0, 0.01)
    p = t2 / (t1 + t2)
    pois_mean = lmb * (t1 + t2)
    k_start = int(pois_mean)

    def _weighted_tails(k):
        """Return (Poisson mass of k, mass-weighted tail probabilities for k)."""
        isfs = binom.isf(alpha_hats, k * l, p)
        cdfs = binom.sf((isfs / l).astype(int), k, p)
        mass = poisson.pmf(k, pois_mean)
        return mass, mass * cdfs

    alphas = np.zeros(len(alpha_hats))
    total_pois_mass = 0.0

    # Upward sweep. The all-ones seed only guarantees the first iteration.
    # TODO: Replace this with other condition.
    alpha_dels = np.ones(len(alpha_hats))
    k = k_start
    while sum(alpha_dels) > 1e-7 * len(alpha_dels):
        mass, alpha_dels = _weighted_tails(k)
        total_pois_mass += mass
        alphas += alpha_dels
        if verbose and (k - k_start) % 100 == 0:
            print("k=" + str(k - k_start) + " alpha_dels sum: "
                  + str(sum(alpha_dels)))
        k += 1

    if verbose:
        print("Completed first loop")

    # Downward sweep covers every remaining k down to zero.
    for k in range(k_start - 1, -1, -1):
        mass, alpha_dels = _weighted_tails(k)
        total_pois_mass += mass
        if np.isnan(sum(alpha_dels)):
            print(k)
        alphas += alpha_dels

    return alphas, alpha_hats, total_pois_mass
def expctd_cond_gr_m(m, n, p):
    """Return E[X | X > m] for X ~ Binomial(n, p).

    For ``m`` above ``int(n / 2)`` the (short) upper tail is summed
    directly. Otherwise the value is derived from the unconditional mean
    ``n * p`` and the complementary conditional expectation returned by
    ``expctd_cond_leq_m`` (defined elsewhere in this module).
    """
    upper_tail = binom.sf(m, n, p)
    if m > int(n / 2):
        support = np.arange(m + 1, n + 1)
        return sum(binom.pmf(support, n, p) / upper_tail * support)
    lower_tail = binom.cdf(m, n, p)
    return n * p / upper_tail - lower_tail / upper_tail * expctd_cond_leq_m(m, n, p)
def checktheory(thres, n, ne, p, te, s):
    """Compute failure probabilities for every error-correction setting in ``te``.

    Parameters
    ----------
    thres : number
        Threshold entering the per-``se`` tail probability.
    n : int
        Vector length; norms ``l1``/``l2`` range over ``0..n``.
    ne : opaque
        Passed through to ``LACprob`` (defined elsewhere).
    p : float
        Success probability of the binomial norm distribution.
    te : iterable
        Error-correction settings; one result per entry.
    s : mapping
        Iterated as ``for i in s`` and read as ``s[i]``, i.e. a mapping
        from value ``i`` to its weight.

    Returns
    -------
    (new, old) : tuple of lists
        ``new[j]`` accumulates ``pl * LACprob(failtmp, ne, te[j])`` over all
        norm pairs; ``old[j]`` is ``LACprob`` applied once to the averaged
        failure probability.
    """
    # calculate failure probability for a certain number of ones in se
    pfaildict = {}
    for se in range(0, n + 1):
        tmp = [binom.sf((thres + i + se) / 2, se, p=0.5) * s[i] for i in s]
        tmp2 = [binom.sf((thres + 1 + i + se) / 2, se, p=0.5) * s[i] for i in s]
        pfaildict[se] = 1.5 * sum(tmp) + 0.5 * sum(tmp2)

    # set everything to zero
    fail = 0
    fail2 = {}
    for te1 in te:
        fail2[te1] = 0

    # The norm pmf depends only on the norm, so evaluate it once per value
    # rather than once per pair.
    norm_pmf = [binom.pmf(norm, n=n, p=p) for norm in range(n + 1)]

    # Unordered pairs (with repetition) drawn from n + 1 values.
    num_pairs = (n + 1) * (n + 2) // 2

    # loop over all norm values
    for l1, l2 in tqdm(itertools.combinations_with_replacement(range(0, n + 1), 2),
                       leave=False, total=num_pairs):
        # probability of a certain norm
        pl = norm_pmf[l1] * norm_pmf[l2]
        if l1 != l2:
            # an unordered pair stands for both orderings
            pl *= 2

        # skip if probability is too small
        if pl < 2**-200:
            continue

        # calculate the probability of a failure
        failtmp = 0
        # loop over all possible number of nonzero elements in se
        for se1 in range(max(0, l1 + l2 - n), min(l1, l2) + 1):
            # probability of number of nonzero elements in se
            pse = hypergeom.pmf(k=se1, M=n, n=l1, N=l2)
            # weighted average share
            failtmp += pse * pfaildict[se1]

        fail += pl * failtmp
        # for new model, take error correction into account
        for te1 in te:
            fail2[te1] += pl * LACprob(failtmp, ne, te1)

    new = []
    old = []
    for te1 in te:
        # for old model, take error correction into account
        old.append(LACprob(fail, ne, te1))
        new.append(fail2[te1])

    return new, old
def checktheory(thres, n, ne, p, te, s):
    """Compute the overall failure probability with error correction applied per norm pair.

    Parameters
    ----------
    thres : number
        Threshold entering the per-``se`` tail probability.
    n : int
        Vector length; norms ``l1``/``l2`` range over ``0..n``.
    ne, te : opaque
        Passed through to ``Binom_prob`` (defined elsewhere).
    p : float
        Success probability of the binomial norm distribution.
    s : mapping
        Iterated as ``for i in s`` and read as ``s[i]``, i.e. a mapping
        from value ``i`` to its weight.

    Returns
    -------
    float
        Sum over all norm pairs of ``pl * Binom_prob(failtmp, ne, te)``.
    """
    # calculate failure probability for a certain number of ones in se
    pfaildict = {}
    for se in range(0, n + 1):
        tmp_0 = [binom.sf((thres + i + se) / 2, se, p=0.5) * s[i] for i in s]
        tmp_1_1 = [
            binom.sf((thres + i + se + 1) / 2, se, p=0.5) * s[i] for i in s
        ]
        tmp_1_2 = [
            binom.sf((thres + i + se - 1) / 2, se, p=0.5) * s[i] for i in s
        ]
        pfaildict[se] = sum(tmp_0) + (sum(tmp_1_1) + sum(tmp_1_2)) * 0.5

    # set everything to zero
    fail = 0

    # The norm pmf depends only on the norm, so evaluate it once per value
    # rather than once per pair.
    norm_pmf = [binom.pmf(norm, n=n, p=p) for norm in range(n + 1)]

    # Unordered pairs (with repetition) drawn from n + 1 values.
    num_pairs = (n + 1) * (n + 2) // 2

    # loop over all norm values
    for l1, l2 in tqdm(itertools.combinations_with_replacement(range(0, n + 1), 2),
                       leave=False, total=num_pairs):
        # probability of a certain norm
        # pl = P[||s||2] * P[||c||2]
        pl = norm_pmf[l1] * norm_pmf[l2]
        if l1 != l2:
            # an unordered pair stands for both orderings
            pl *= 2

        # skip if probability is too small
        if pl < 2**-100:  ## 200
            continue

        # calculate the probability of a failure
        failtmp = 0
        # loop over all possible number of nonzero elements in se
        for se1 in range(max(0, l1 + l2 - n), min(l1, l2) + 1):
            # probability of number of nonzero elements in se
            pse = hypergeom.pmf(k=se1, M=n, n=l1, N=l2)
            # weighted average share
            failtmp += pse * pfaildict[se1] * 0.5

        # failtmp = pb
        # for new model, take error correction into account
        fail += pl * Binom_prob(failtmp, ne, te)  # Binom_prob : 1 - Binom(d, lm, pb)

    return fail
def pbinom(x, size=1, prob=0.5, lowertail=True, log=False):
    """Cumulative probability of the binomial distribution (R-style ``pbinom``).

    :param x:         int or array of ints. Points at which to evaluate.
    :param size:      int. Number of trials.
    :param prob:      float. Probability of success on each trial.
    :param lowertail: bool. If true return P(X <= x), otherwise P(X > x).
    :param log:       bool. If true return the natural log of the probability.
    :return:          probability (or log-probability) for each value in x.
    """
    if log:
        tail = binom.logcdf if lowertail else binom.logsf
    else:
        tail = binom.cdf if lowertail else binom.sf
    return tail(x, n=size, p=prob)
def calc_sf_all(v, n, p, prev_best_score=False):
    """Return ``-log10 P(X >= v)`` for X ~ Binomial(n, p), element-wise.

    NaN scores are set to 0. Infinite scores (tail probability of exactly
    zero) are replaced by ``prev_best_score`` when one is supplied, or
    otherwise by twice the largest finite score.

    ``v`` must be a NumPy array, because the result is modified by mask
    assignment.
    """
    scores = -np.log10(binom.sf(v - 1, n, p))
    scores[np.isnan(scores)] = 0

    infinite = np.isinf(scores)
    if prev_best_score is not False:
        fill_value = prev_best_score
    else:
        fill_value = max(scores[~infinite]) * 2
    scores[infinite] = fill_value
    return scores
def check(N, p):
    """Connect one source neuron to ``N`` targets with probability ``p`` and test the synapse count.

    Updates the module-level accumulators ``numfails``, ``numchecks``,
    ``mu`` and ``sigma2`` and, for ``0 < p < 1``, the ``isi`` / ``num_isi``
    histograms of gaps between consecutive target indices.

    Returns True when the number of synapses falls outside the binomial
    interval for level ``alpha`` (a module-level value), False otherwise.

    Raises ValueError if the same target index appears twice in a row.
    """
    global numfails, numchecks, mu, sigma2

    source = NeuronGroup(1, 'v:1', threshold='False', name='H')
    targets = NeuronGroup(N, 'v:1', threshold='False', name='G')
    S = Synapses(source, targets, on_pre='v+=w', name='S')
    S.connect(p=p)
    m = len(S)

    low, high = binom.interval(alpha, N, p)
    if p == 0:
        low = high = 0
    elif p == 1:
        low = high = N
    else:
        # Histogram of gaps between consecutive target indices,
        # truncated at isi_max[p].
        gaps = diff(S.j[:])
        gaps = gaps[gaps < isi_max[p]]
        b = bincount(gaps, minlength=isi_max[p])[:isi_max[p]]
        if b[0]:
            # A gap of zero means a target index was repeated.
            print('Major error: repeated indices for N=%d, p=%.3f' % (N, p))
            raise ValueError("Repeated indices")
        isi[p] += b
        num_isi[p] += sum(b)

    # Probability that a correct implementation lands outside [low, high].
    q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
    mu += q
    sigma2 += q*(1-q)
    numchecks += 1

    if not (m < low or m > high):
        return False
    numfails += 1
    return True
def gather_stats_binom_control_muts(t, c, seq, treat_nm, nm, decisions):
    '''
      Filter treatment mutations that can be explained by control freq.
      In practice, this step is most effective for control mutations
      with relatively high frequency => relatively high variance

      Considers all events that occur (fq > 0%) in both control and
      treatment data.

      For every such event one row is appended to the lists in
      `decisions`; `pval` is the binomial upper tail
      P(X >= treatment count) with n = treatment total and
      p = control frequency.

      t, c      : per-position count vectors for treatment / control
      seq       : reference sequence, one nucleotide per position
      treat_nm  : passed to _data.idx_to_pos to map index -> position
      nm        : label stored in decisions['nm']
      decisions : dict of lists, extended in place
    '''
    for jdx, ref_nt in enumerate(seq):
        c_tot = sum(c[jdx])
        t_tot = sum(t[jdx])

        # Without control coverage no control frequency exists, so no
        # event can occur "in both" data sets (and c_fq would be 0/0).
        if c_tot == 0:
            continue

        for kdx in range(len(t[jdx])):
            if kdx == nt_to_idx[ref_nt] or t[jdx][kdx] == 0:
                continue

            c_fq = c[jdx][kdx] / c_tot
            if not c_fq > 0:
                # Event is absent from the control data.
                continue

            t_fq = t[jdx][kdx] / t_tot
            # sf(count - 1) is P(X >= count).
            pval = binom.sf(t[jdx][kdx] - 1, t_tot, c_fq)

            decisions['obs_nt'].append(nts[kdx])
            decisions['ref_nt'].append(ref_nt)
            decisions['c_fq'].append(c_fq)
            decisions['c_ct'].append(c[jdx][kdx])
            decisions['t_fq'].append(t_fq)
            decisions['t_ct'].append(t[jdx][kdx])
            decisions['c_tot'].append(c_tot)
            decisions['t_tot'].append(t_tot)
            decisions['idx'].append(jdx)
            decisions['pos'].append(_data.idx_to_pos(jdx, treat_nm))
            decisions['pval'].append(pval)
            decisions['nm'].append(nm)
    return
def calculate_p_value(ex_bg, ex_mut, in_bg, in_mut, p_bg=0.016):
    """Return ``1 - P(X > ex_mut)`` for X ~ Binomial(ex_bg, rate).

    ``rate`` is ``in_mut / in_bg``, floored at ``p_bg``. The test is only
    evaluated when both background counts are at least 10 and ``ex_mut`` is
    non-negative; otherwise 1 is returned.
    """
    testable = in_bg >= 10 and ex_bg >= 10 and ex_mut >= 0
    if not testable:
        return 1
    rate = max(np.divide(in_mut, in_bg), p_bg)
    return 1 - binom.sf(k=ex_mut, n=ex_bg, p=rate)
def binomial_test(n, N, P):
    """One-sided binomial test for an excess of observed mutations.

    The p-value is the probability of seeing ``n`` or more mutations among
    ``N`` residues at risk when each mutates independently with background
    probability ``P``, i.e. the survival function evaluated at ``n - 1``.

    Parameters
    ----------
    n : int
        number of observed mutations
    N : int
        number of residues at risk
    P : float
        background probability that a mutation would occur at a single
        residue

    Returns
    -------
    pval : float
        p-value for the binomial test; 1.0 when ``n <= 0``
    """
    if n <= 0:
        pval = 1.0
    else:
        pval = binom.sf(n - 1, N, P)
    return pval
def calculate_parallelism_qvalues(gene_statistics):
    """Per-gene binomial p-values and q-values for an excess of observed events.

    ``gene_statistics`` maps gene name -> dict with keys ``'observed'`` and
    ``'expected'``. Each gene's success probability is its expected count
    divided by the total observed count, and the p-value is the upper
    binomial tail at ``observed - 0.5`` (i.e. P(X >= observed) for integer
    counts). The q-values come from ``stats_utils.calculate_qvalues``.

    Returns ``(qvalue_map, pvalue_map)``, both keyed by gene name.
    """
    gene_names = list(gene_statistics.keys())
    ns = numpy.array([gene_statistics[name]['observed'] for name in gene_names])
    expected_ns = numpy.array(
        [gene_statistics[name]['expected'] for name in gene_names])

    ntot = ns.sum()
    ps = expected_ns / ntot
    ntots = ntot * numpy.ones_like(ps)

    pvalues = binom.sf(ns - 0.5, ntots, ps)
    qvalues = stats_utils.calculate_qvalues(pvalues)

    qvalue_map = dict(zip(gene_names, qvalues))
    pvalue_map = dict(zip(gene_names, pvalues))
    return qvalue_map, pvalue_map
def l_pod_k_of_n_av(l, n, k, p, q, pod_less_mcs=0):
    """
    Availability of a k-of-n system whose n machines are spread over l pods.

    See here: https://math.stackexchange.com/questions/3825082/reliability-of-an-l-pod-k-of-n-system?noredirect=1#comment7888736_3825082
    Answer matches with legacy method:
    pffc_resiliency(3,7,.9,.8,1)==l_pod_k_of_n(3,7,4,0.9,0.8)
    args:
        l: number of pods.
        n: number of machines.
        k: minimum number of working machines required.
        p: probability that a machine is up (used in the binomial tail).
        q: probability that a pod is up (used in the pod-count pmf).
        pod_less_mcs: This is a parameter for the AIR formula.
                Machines for which the pod is guaranteed to work.
    returns:
        Probability that at least k machines are available.
    """
    hero_size = np.ceil(n / l)
    joe_size = hero_size - 1
    ## Num of hero and joe pods.
    # A hero pod has one more machine than a joe pod.
    h = int(n - l * joe_size)
    j = int(l * hero_size - n)

    prob = 0.0
    ## All combinations of hero and joe pod availability.
    for heroes_up in range(h + 1):
        hero_weight = binom.pmf(heroes_up, h, q)
        for joes_up in range(j + 1):
            # Num of machines reachable with these many pods up.
            reachable = heroes_up * hero_size + joes_up * joe_size + pod_less_mcs
            if reachable < k:
                continue
            ## We get a k of `reachable` system among the machines.
            prob += hero_weight * binom.pmf(joes_up, j, q) * binom.sf(
                k - 1, reachable, p)
    return prob
def reducer(self, key, values):
    """Emit one ``'<friend> <follower> <edgeprob> '`` line per follower.

    ``key`` is indexed as ``(friend, fu, ru)``; ``values`` yields
    ``(follower, ruu)`` pairs. For each distinct ``ruu`` the number of
    followers with a value less than or equal to it is accumulated, and that
    cumulative count enters the edge probability.

    Yields ``(None, line)`` tuples.
    """
    friend = key[0]
    fu = key[1]
    ru = key[2]

    hist = {}  # assumed to be very small relative to fs
    fs = []    # hopefully not too big, maybe in the hundreds of thousands
    for follower, ruu in values:
        fs.append((follower, ruu))
        hist[ruu] = hist.get(ruu, 0) + 1

    # Cumulative counts: cdf[k] = number of followers with ruu <= k.
    # A single sorted pass replaces the quadratic double loop, and avoids
    # the Python-2-only dict.iteritems.
    cdf = {}
    running = 0
    for level in sorted(hist):
        running += hist[level]
        cdf[level] = running

    for follower, ruu in fs:
        edgeprob = 0
        if not fu == 0:
            # NOTE(review): max(1, ...) makes every emitted value >= 1 and
            # 1 // fu is 0 for any integer fu > 1; this looks like it was
            # meant to be min(1, ...) with true division -- confirm intent
            # before changing.
            edgeprob = max(1, (fu // cdf[ruu]) * binom.sf(ruu, ru, 1 // fu))
        yield None, ('%d %d %f ' % (friend, follower, edgeprob))
def binomial_test(n, N, P):
    """One-sided binomial test for an excess of observed mutations.

    The p-value is the probability of seeing ``n`` or more mutations among
    ``N`` residues at risk when each mutates independently with background
    probability ``P``, i.e. the survival function evaluated at ``n - 1``.

    Parameters
    ----------
    n : int
        number of observed mutations
    N : int
        number of residues at risk
    P : float
        background probability that a mutation would occur at a single
        residue

    Returns
    -------
    pval : float
        p-value for the binomial test; 1.0 when ``n <= 0``
    """
    if n <= 0:
        pval = 1.0
    else:
        pval = binom.sf(n - 1, N, P)
    return pval
def binomial_p(x, n, p, alternative='greater'):
    """
    Binomial test p-value for ``x`` successes in ``n`` trials.

    Parameters
    ----------
    x : int
        observed number of successes
    n : int
        number of trials
    p : float
        hypothesized probability of success on each trial
    alternative : {'greater', 'less', 'two-sided'}
        alternative hypothesis to test (default: 'greater')

    Returns
    -------
    float
        'greater': P(X >= x); 'less': P(X <= x); 'two-sided': twice the
        smaller tail, capped at 1.

    Raises
    ------
    AssertionError
        if ``alternative`` is not one of the three recognised values
    ValueError
        if ``x`` exceeds ``n``
    """
    assert alternative in ("two-sided", "less", "greater")
    if n < x:
        raise ValueError(
            "Cannot observe more successes than the population size")

    lower_tail = binom.cdf(x, n, p)
    upper_tail = binom.sf(x - 1, n, p)

    if alternative == 'greater':
        return upper_tail
    if alternative == 'less':
        return lower_tail
    # two-sided: including 0.5 in the minimum caps the result at 1
    return 2 * np.min([lower_tail, upper_tail, 0.5])
def testScipy(N, p, outdir):
    """Plot the Binomial(N, p) pmf, cdf and sf over 0..N and save the figure.

    The image is written to ``outdir + "/distributions.png"``.
    """
    fig, ax = plt.subplots(1, 1)
    x = list(range(0, N + 1))
    curves = (
        (binom.pmf, 'bo', 'binom pmf'),
        (binom.cdf, 'ro', 'binom cdf'),
        (binom.sf, 'go', 'binom sf'),
    )
    for dist_fn, marker, label in curves:
        ax.plot(x, dist_fn(x, N, p), marker, ms=8, label=label)
    ax.legend(loc='best', frameon=False)
    plt.savefig(outdir + "/distributions.png")
def value_func(value, round_left):
    """Probability of exceeding ``value`` at least once in ``round_left`` rounds.

    Each round exceeds ``value`` with probability ``binom.sf(value, n, p)``
    (``n`` and ``p`` come from the enclosing scope). The per-round
    probability is combined with the inclusion-exclusion update
    ``a + b - a * b`` once per additional round.
    """
    sf = binom.sf(value, n, p)
    if round_left == 1:
        return sf

    prob = sf
    for _ in range(1, round_left):
        prob = prob + sf - prob * sf
    assert prob <= 1  # this must always hold
    return prob
def binominal_backtest(failures, conf=0.05):
    """
    Binominal backtest.

    ``failures`` is an array whose first dimension is the number of
    observations and whose sum is the number of failures. When the failure
    count is at least ``size * conf`` the upper tail P(X >= failures) is
    returned, otherwise the lower tail P(X <= failures), with
    X ~ Binomial(size, conf).

    Implementation based on
    on https://rdrr.io/cran/Dowd/src/R/BinomialBacktest.R
    """
    n_obs = failures.shape[0]
    n_fail = np.sum(failures)
    if n_fail < n_obs * conf:
        return binom.cdf(n_fail, n_obs, conf)
    return binom.sf(n_fail - 1, n_obs, conf)
def getMultiplePsFdr(iva, ivb, model, N, win=6):
    """
    For the intervals a and b, search nearby windows to estimate FDR and
    p-values. Using matched nearby windows at a similar distance to a & b
    would need too many windows, so all nearby pairs are used instead.

    return ra, rb, rab, es, fdr, hyp, chyp, pop, nbp
    """
    ra, rb, rab = getPETsforRegions(iva, ivb, model)
    # simple hypergeometric test; using cis_a + cis_b + trans_a + trans_b
    # as M and cis_a + cis_b as N fails with all p-values equal to 1
    hyp = hypergeom.sf(rab - 1.0, N, ra, rb)

    ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
    hyps, rabs, nbps = [], [], []
    for na in ivas:
        nra = getCounts(na, model[0]).union(getCounts(na, model[1]))
        nralen = float(len(nra))
        if nralen < 1:
            continue
        for nb in ivbs:
            nrb = getCounts(nb, model[0]).union(getCounts(nb, model[1]))
            nrblen = len(nrb)
            if nrblen < 1:
                continue
            nrab = float(len(nra.intersection(nrb)))
            # value for the poisson test
            rabs.append(nrab)
            # nearby hypergeometric test result
            hyps.append(hypergeom.sf(nrab - 1.0, N, nralen, nrblen))
            # linking density for the binomial test
            nbps.append(nrab / (nralen * nrblen))

    if not rabs:
        # no usable nearby background
        return (ra, rb, rab, np.inf, 0.0, hyp, 0.0, 0.0, 0.0)

    hyps = np.array(hyps)
    rabs = np.array(rabs)

    # local fdr
    fdr = len(rabs[rabs > rab]) / float(len(rabs))

    # enrichment score
    mrabs = float(np.mean(rabs))
    es = rab / mrabs if mrabs > 0 else np.inf

    # corrected hypergeometric fdr
    chyp = len(hyps[hyps < hyp]) / float(len(hyps))

    # simple poisson test, borrowing the dynamic lambda idea from MACS
    pop = poisson.sf(rab - 1.0, mrabs)

    # simple binomial test
    bp = np.mean(nbps) * ra * rb / N
    nbp = binom.sf(rab - 1.0, N - rab, bp)

    return ra, rb, rab, es, fdr, hyp, chyp, pop, nbp
def pval(self, k):
    """Return the p-value corresponding to k, i.e. P(X >= k) = 1 - cdf(k - 1)."""
    uniform_p = self.p[0] * np.ones(self.p.shape)
    if np.array_equal(self.p, uniform_p):
        # All success probabilities are equal, so the distribution is an
        # ordinary binomial.
        return binom.sf(k - 1, self.n, self.p[0])
    if k > 0:
        return 1. - self.cdf(k - 1)
    return 1.
def _compute_p_values(self, betas):
    """
    Compute p-values of each predictor for Statistical Test of Variable Selection.

    ``betas`` holds one row per predictor. For each predictor the number of
    non-zero coefficients is compared with a Binomial(self.B, pi) null,
    where ``pi`` is the overall fraction of non-zero entries in ``betas``.
    The p-value is the upper tail P(X >= count).
    """
    # number of times each predictor was selected (non-zero coefficient)
    selected_counts = (betas != 0).sum(axis=1)
    # average selection ratio over all predictors and bootstrap samples
    selection_ratio = selected_counts.sum() / betas.size
    return binom.sf(selected_counts - 1, n=self.B, p=selection_ratio)
def get_m_n_from_bernoulli(N):
    """Return ``m / n`` for every ``n`` in ``1 .. N-1`` under a Bernoulli(0.05) model.

    For each ``n`` the binomial survival function is tabulated over the
    range between the 0 and 1 quantiles, and ``find_m`` (defined elsewhere)
    picks ``m`` from that table for the target probability 0.05.
    """
    p, P_B = 0.05, 0.05
    sample_sizes = np.arange(1, N)
    m_n_bernoulli = sample_sizes * np.nan
    for idx, n in enumerate(sample_sizes):
        x = np.arange(binom.ppf(0.00, n, p), binom.ppf(1.00, n, p))
        tail = binom.sf(x, n, p)
        m = find_m(tail, P_B)
        m_n_bernoulli[idx] = m * 1. / n
    return (m_n_bernoulli)
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):
    """Assign each peak a binomial p-value against its gene's read distribution.

    Gene read and spot counts are loaded from ``allGeneInfo``. For every
    record in ``degFN`` the p-value is ``binom.sf(eLevel, reads + 1,
    1 / (spots + 1))`` per associated gene; the largest one is stored in
    ``pValBin`` (or -1.0 when no gene could be resolved).
    """
    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])

    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots

    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():

        gNames, readsForPeak = dNX.geneNames, dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)

        pVals = []
        for gName in gNames:

            #may have to change gene name cuz of multiple spans
            try:
                totGeneReads = gName_numReads[gName]
                numSpotsForGene = gName_numSpots[gName]
            except KeyError:
                try:
                    gName = gName + '_RE_%s_%s' % (chrom, strand)
                    totGeneReads = gName_numReads[gName]
                    numSpotsForGene = gName_numSpots[gName]
                except KeyError:
                    # Single-argument form prints the same text under
                    # Python 2 and Python 3.
                    print("FIX THIS GENE NAME %s" % gName)
                    continue

            #add pseudocount
            totGeneReads += 1
            numSpotsForGene += 1 # not sure whether to do this yet...

            #check for hidden intron gene overlap
            try:
                q = 1.0/numSpotsForGene
            except ZeroDivisionError:
                continue #intron gene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        dNX.pValBin = max(pVals) if pVals else -1.0

    dNX.save()
def getEfficiencyGrid(self, sample, hitsPdf):
    """Build a 2-D histogram of P(at least iy of 4 successes) per charge bin.

    For each x-bin of ``hitsPdf`` the per-trial success probability is
    ``hitsPdf.Integral(ix + 1, -1)``, and bin ``(ix + 1, iy + 1)`` receives
    ``binom.sf(iy - 1, 4, probability)`` for ``iy`` in 0..4.
    """
    hist_name = sample + '_eff'
    eff = TH2D(hist_name, hist_name, 1000, 0, 100, 5, 0, 5)
    eff.Sumw2()
    eff.SetDirectory(0)

    for ix in range(hitsPdf.GetNbinsX()):
        xbin = ix + 1
        chargeEfficiency = float(hitsPdf.Integral(xbin, -1))  # already normalized
        for iy in range(5):
            # sf(iy - 1) is 1 - cdf(iy - 1), i.e. P(X >= iy)
            eff.SetBinContent(xbin, iy + 1, binom.sf(iy - 1, 4, chargeEfficiency))

    return eff
def _compute_p_values(self, betas):
    """
    Compute p-values of each predictor for Statistical Test of Variable Selection.

    NaN entries of ``betas`` are ignored: a predictor counts as selected
    where its coefficient is both non-NaN and non-zero, and the null
    selection ratio ``pi`` is the number of selections divided by the
    number of non-NaN entries. The p-value is the upper tail
    P(X >= count) for X ~ Binomial(self.B, pi).
    """
    present = ~np.isnan(betas)
    # number of non-null, non-zero coefficients per predictor
    selected_counts = np.logical_and(present, betas != 0).sum(axis=1)
    # average selection ratio over all non-null entries
    selection_ratio = selected_counts.sum() / present.sum().sum()
    return binom.sf(selected_counts - 1, n=self.B, p=selection_ratio)
def get_diff_pvalues_poisson(
        ref_matrix: ExpMatrix, comp_matrix: ExpMatrix) -> pd.Series:
    """Per-gene p-values for higher expression in ``comp_matrix`` than in ``ref_matrix``.

    Both matrices are scaled to the mean of their median transcript counts
    and restricted to shared genes with non-zero total expression. For each
    gene the summed counts of the two matrices are compared with a binomial
    test whose success probability is the fraction of cells belonging to
    ``comp_matrix``. P-values of exactly zero are replaced by the smallest
    positive float.

    Returns a Series named ``'pval'`` indexed by gene.
    """
    genes = ref_matrix.genes & comp_matrix.genes

    # bring both matrices to a common transcript count
    target_count = (ref_matrix.median_transcript_count
                    + comp_matrix.median_transcript_count) / 2.0
    ref_matrix = ref_matrix.scale(target_count)
    comp_matrix = comp_matrix.scale(target_count)

    ref_matrix = ref_matrix.loc[genes]
    comp_matrix = comp_matrix.loc[genes]

    ref_num_cells = ref_matrix.num_cells
    comp_num_cells = comp_matrix.num_cells

    # keep only genes expressed in at least one of the matrices
    expressed = ((ref_matrix.sum(axis=1) + comp_matrix.sum(axis=1)) > 0)
    ref_matrix = ref_matrix.loc[expressed]
    comp_matrix = comp_matrix.loc[expressed]

    genes = ref_matrix.index.copy()
    num_genes = len(genes)

    pvals = np.ones(num_genes, dtype=np.float64)
    for i in range(num_genes):
        ref_total = ref_matrix.iloc[i, :].sum()
        comp_total = comp_matrix.iloc[i, :].sum()
        total = ref_total + comp_total

        # round the total up to an integer and rescale the comparison count
        # by the same factor
        total_ceil = int(ceil(total))
        comp_total *= total_ceil / total

        # round the comparison count down to an integer;
        # this results in slightly conservative p-values
        comp_floor = int(floor(comp_total))

        # binomial success probability, taking both cell counts into account
        p = comp_num_cells / (ref_num_cells + comp_num_cells)

        # probability of comp_floor or more successes out of total_ceil
        # (upper tail)
        pvals[i] = binom.sf(comp_floor - 1, total_ceil, p)

    # avoid exact zeros
    pvals[pvals == 0] = np.nextafter(0, 1)

    # convert to series
    pvals = pd.Series(index=genes, data=pvals, name='pval')
    pvals.index.name = 'gene'

    return pvals
def calc_sf_all(v, n, p):
    """Return ``-log10 P(X > v)`` for X ~ Binomial(n, p), element-wise.

    Infinite scores (tail probability of exactly zero) are set to 0.
    ``v`` must be array-like so that the result supports mask assignment.
    """
    scores = -np.log10(binom.sf(v, n, p))
    infinite = np.isinf(scores)
    scores[infinite] = 0
    return scores
def plot_cummulative_over_rangeV(rO, N, rV):
    """Tail probabilities (in percent) of exceeding ``floor(V / 2) + 1`` for each V in ``rV``.

    For every draw size V two values are computed at the threshold
    ``floor(V / 2) + 1``: the hypergeometric survival function with
    population ``N`` and ``N * rO`` marked items, and the binomial survival
    function with success probability ``rO``. Both are multiplied by 100.

    Returns ``(pH, pB)``: the hypergeometric and binomial lists.
    """
    O = N * rO
    cases = [(rVi, math.floor(rVi / 2) + 1) for rVi in rV]
    pH = [100 * hypergeom.sf(threshold, N, O, rVi) for rVi, threshold in cases]
    pB = [100 * binom.sf(threshold, rVi, rO) for rVi, threshold in cases]
    return (pH, pB)
def plot_cummulative_over_rangeVmin(rO, N, rVmin):
    """Tail probabilities of exceeding ``floor(V / 2) + 1`` for each V in ``rVmin``.

    For every draw size V two values are computed at the threshold
    ``floor(V / 2) + 1``: the hypergeometric survival function with
    population ``N`` and ``N * rO`` marked items, and the binomial survival
    function with success probability ``rO``.

    Returns ``(pH, pB)``: the hypergeometric and binomial lists.
    """
    O = N * rO
    cases = [(rVi, math.floor(rVi / 2) + 1) for rVi in rVmin]
    pH = [hypergeom.sf(threshold, N, O, rVi) for rVi, threshold in cases]
    pB = [binom.sf(threshold, rVi, rO) for rVi, threshold in cases]
    return (pH, pB)
def binomialTailTest(counts, nTrials, pEvent, oneSided=True):
    """Binomial tail p-values for an array of observed counts.

    Parameters
    ----------
    counts : array-like of int
        Observed numbers of events.
    nTrials : int
        Number of trials.
    pEvent : float
        Per-trial event probability.
    oneSided : bool
        If true, each count is tested in the direction of its deviation:
        P(X >= count) above the mean, P(X <= count) otherwise. If false,
        both tails at the same distance from the mean are added.

    Returns
    -------
    np.ndarray
        One p-value per count, never larger than 1.
    """
    # Local import keeps the function independent of star-imported names.
    import numpy as np

    counts = np.array(counts)
    mean = nTrials * pEvent

    if oneSided:
        above = counts > mean
        result = np.zeros(counts.shape)
        result[above] = binom.sf(counts[above] - 1, nTrials, pEvent)
        result[~above] = binom.cdf(counts[~above], nTrials, pEvent)
        return result

    diffs = abs(counts - mean)
    result = binom.cdf(mean - diffs, nTrials, pEvent)
    result = result + binom.sf(mean + diffs - 1, nTrials, pEvent)
    # When a count equals the mean the two tails overlap at that point and
    # their sum exceeds 1; a p-value must not.
    return np.minimum(result, 1.0)
def getMultiplePsFdr(iva, ivb, model, N, win=5):
    """
    For the intervals a and b, search nearby windows to estimate FDR and
    p-values.

    All p-values are floored at 1e-300.

    return ra, rb, rab, es, fdr, hyp, pop, nbp
    """
    ra, rb, rab = getPETsforRegions(iva, ivb, model)
    hyp = max([1e-300, hypergeom.sf(rab - 1.0, N, ra, rb)])
    ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)

    # nras / nrbs store the point ids found in each permutated region
    nras, nrbs = [], []
    for na in ivas:
        nraSource = getCounts(na, model[0])
        nraTarget = getCounts(na, model[1])
        nras.append(nraSource.union(nraTarget))
    for nb in ivbs:
        nrbSource = getCounts(nb, model[0])
        nrbTarget = getCounts(nb, model[1])
        nrbs.append(nrbSource.union(nrbTarget))

    # calculating the permutated background
    rabs, nbps = [], []
    for nra in nras:
        nralen = float(len(nra))
        for nrb in nrbs:
            nrblen = len(nrb)
            nrab = float(len(nra.intersection(nrb)))
            if nrab > 0:
                # value for the poisson test
                rabs.append(nrab)
                # linking density for the binomial test
                nbps.append(nrab / (nralen * nrblen))
            else:
                nbps.append(0.0)
                rabs.append(0.0)

    if len(rabs) == 0:
        # No background at all. Return the same eight fields as the normal
        # exit; the former nine-element tuple carried a stale extra slot
        # and broke callers unpacking eight values.
        return ra, rb, rab, np.inf, 0.0, hyp, 1e-300, 1e-300

    rabs = np.array(rabs)

    # local fdr
    fdr = len(rabs[rabs > rab]) / float(len(rabs))

    # enrichment score, relative to the mean of the non-zero background
    mrabs = float(np.mean(rabs))
    if mrabs > 0:
        es = rab / np.mean(rabs[rabs > 0])
    else:
        es = np.inf

    # simple poisson test
    lam = mrabs
    pop = max([1e-300, poisson.sf(rab - 1.0, lam)])

    # simple binomial test
    bp = np.mean(nbps) * ra * rb / N
    nbp = max([1e-300, binom.sf(rab - 1.0, N - rab, bp)])

    return ra, rb, rab, es, fdr, hyp, pop, nbp
def min_depth(depth, error, threshold=0.98):
    ''' determine the maximum parental alt depth permitted across both parents

    We look at the minimum alternate depth across both parents, which needs
    to be low. The minimum exceeds a candidate count only when both parents
    exceed it, which happens with probability
    prob(first parent exceeds) * prob(second parent exceeds). The candidate
    count is incremented from zero until one minus that product is above
    the threshold.

    Args:
        depth: depth of parental sequencing. Can either be a single value
            (as a summary value for both parents), or a list of two depths,
            one for each parent.
        error: site-specific error rate (e.g. 0.002)
        threshold: probability threshold that we want to exceed. With a
            value of 0.98, 98% of cases will have a minimum alt depth no
            greater than the returned count.

    Returns:
        maximum permitted alternate depth across both parents.
    '''
    # a single number stands for both parents; a sequence is used as given
    try:
        single = int(depth)
        depth = [single, single]
    except TypeError:
        pass

    # exactly two depths are required
    assert len(depth) == 2
    assert 0.0 < threshold < 1.0

    candidate = 0
    while True:
        both_exceed = 1
        for parent_depth in depth:
            both_exceed *= binom.sf(candidate, parent_depth, error)
        if 1 - both_exceed > threshold:
            return(candidate)
        candidate += 1
def _calculate_quasi_p(self, i):
    """Calculates quasi-p values as discussed in Bryant and Lempert (2010).

    This is a one sided binomial test: for every restricted dimension the
    restriction is lifted, the density of cases of interest in the relaxed
    box is used as the null probability, and the upper tail of observing at
    least as many cases of interest as box ``i`` actually contains is
    returned.

    Parameters
    ----------
    i : int
        the specific box in the peeling trajectory for which the quasi-p
        values are to be calculated.

    Returns
    -------
    dict
        mapping each restricted dimension to its quasi-p value

    """
    box_lim = self._box_lims[i]
    restricted_dims = list(determine_restricted_dims(
        box_lim, self.prim._box_init))

    # total nr. of cases in box
    Tbox = self.peeling_trajectory['mass'][i] * self.prim.n
    # total nr. of cases of interest in box
    Hbox = self.peeling_trajectory['coverage'][i] * self.prim.t_coi

    qp_values = {}
    for u in restricted_dims:
        # same box, with the restriction on dimension u removed
        relaxed_box = copy.deepcopy(box_lim)
        relaxed_box[u] = self._box_lims[0][u]

        inside = in_box(self.prim.x[self.prim.yi_remaining], relaxed_box)
        inside = self.prim.yi_remaining[inside]

        # total nr. of cases in the relaxed box
        Tj = inside.shape[0]
        # total nr. of cases of interest in the relaxed box
        Hj = np.sum(self.prim.y[inside])
        null_density = Hj/Tj

        Hbox = int(Hbox)
        Tbox = int(Tbox)

        qp_values[u] = binom.sf(Hbox-1, Tbox, null_density)

    return qp_values
def _count_first_topology(tree, groups):
    """Count groups ((A, B), (C, D)) in which B is at least as close to A as C and D are."""
    count = 0
    for group in groups:
        nodeA, nodeB = group[0][0], group[0][1]
        nodeC, nodeD = group[1][0], group[1][1]
        distAB = tree.distance(nodeA, nodeB)
        distAC = tree.distance(nodeA, nodeC)
        distAD = tree.distance(nodeA, nodeD)
        if distAB == min(distAB, distAC, distAD):
            count += 1
    return count


def quartet_analysis(tree, quartets, perms):
    """One-tailed binomial p-value for the quartets supporting the A-B pairing.

    Parameters
    ----------
    tree : object
        Must provide ``distance(node1, node2)``.
    quartets : sequence
        Observed quartets, each shaped ``((A, B), (C, D))``. At most 1000
        are used; larger inputs are randomly subsampled.
    perms : sequence
        Permuted quartets of the same shape; the fraction supporting the
        A-B pairing is the success probability of the null binomial.

    Returns
    -------
    float
        ``binom.sf(k, n, P)`` where ``k`` is the number of selected
        quartets supporting the A-B pairing and ``n`` the number selected.
    """
    # P is the binomial dist's p, estimated by permutations
    P = float(_count_first_topology(tree, perms)) / len(perms)

    # cap the number of observed quartets that are evaluated
    if len(quartets) > 1000:
        selQ = random.sample(quartets, 1000)
    else:
        selQ = quartets
    numTopo1 = _count_first_topology(tree, selQ)

    # one tailed p-val, binomial dist. survival function
    p = binom.sf(numTopo1, len(selQ), P)
    return p
def prob_of_indel_with_error(input, soft_chr, soft_pos, prob):
    """Binomial upper-tail probability of the soft-clipped reads seen at a position.

    Counts the reads overlapping ``soft_pos`` and, among reads fetched in a
    +/- 1 bp window, those with a soft clip within 1 bp of ``soft_pos``.
    Returns P(X >= soft-clipped count) for
    X ~ Binomial(total reads, prob).

    Parameters
    ----------
    input : str
        Path to the BAM file.
    soft_chr : str
        Reference name.
    soft_pos : int
        Position of the suspected soft-clip breakpoint.
    prob : float
        Per-read probability of a soft clip arising by error.
    """
    alignment = pysam.Samfile(input, 'rb')
    # The handle used to be left open; close it once the reads have been
    # processed, even if an error occurs.
    try:
        total = alignment.count(soft_chr, soft_pos, soft_pos + 1)
        try:
            reads = [read for read in
                     alignment.fetch(soft_chr, soft_pos - 1, soft_pos + 2)]
        except ValueError:
            # the widened window was rejected; fall back to the exact position
            reads = [read for read in
                     alignment.fetch(soft_chr, soft_pos, soft_pos + 1)]

        num_soft = 0
        for each in reads:
            if each.is_secondary or each.is_unmapped:
                continue
            soft_len, soft_qual, soft_pos_read = get_softclip_length(each)
            if soft_len != 0 and abs(soft_pos_read - soft_pos) < 2:  # +/- 1bp matching
                num_soft += 1
    finally:
        alignment.close()

    return binom.sf(num_soft - 1, total, prob)
def prop_test(df):
    """
    One-sided binomial test of the accuracy in a confusion matrix.

    The number of successes is the sum of the diagonal, the number of
    trials is the sum of all cells, and the null probability is the largest
    column share. The p-value is P(X >= successes).

    Inspired from R package caret confusionMatrix.R
    """
    from scipy.stats import binom

    successes = np.diag(df).sum()
    trials = df.sum().sum()
    null_rate = (df.sum(axis=0) / df.sum().sum()).max()
    # see https://en.wikipedia.org/wiki/Binomial_test
    p_value = binom.sf(successes - 1, trials, null_rate)

    d = {
        "statistic": successes,    # number of successes
        "parameter": trials,       # number of trials
        "null.value": null_rate,   # probability of success
        "p.value": p_value,
    }
    return(d)
def mask(self, count_threshold = 20, impute_threshold = 0.5):
    """ Mask locations that aren't heterozygotes and the locations that
    don't meet a read count threshold.

    Sets ``self.binom_p`` (binomial upper-tail probability of the alt
    count given ref + alt reads and p = 0.5) and ``self.ratio``
    (ref / (ref + alt)). Masked entries are NaN.

    :param count_threshold: minimum ref + alt read count to keep an entry.
    :param impute_threshold: not used in this method.

    If a required attribute is missing, a message is printed and nothing
    is set.
    """
    try:
        # Reducing the dataframe based on annotations
        ref_tup = zip(self.annot.index, self.annot["REF"])
        alt_tup = zip(self.annot.index, self.annot["ALT"])
        ref = self.df.ix[ref_tup]
        alt = self.df.ix[alt_tup]
        # Need to collapse the multi index
        # :TODO find a more rigorous way to do this
        ref.index = self.genotypes.index
        alt.index = self.genotypes.index
        # NOTE(review): despite its name, `hets` is True where the genotype
        # lies outside [0.5, 1.5], i.e. the entries that get masked.
        hets = np.logical_or(self.genotypes < 0.5, self.genotypes > 1.5)
        # True where the combined read count is below the threshold
        sums = (ref + alt) < count_threshold
        ref[np.logical_or(hets, sums)] = np.NaN
        alt[np.logical_or(hets, sums)] = np.NaN
        # sf(alt - 1) is P(X >= alt) under Binomial(ref + alt, 0.5)
        self.binom_p = pd.DataFrame(binom.sf(alt - 1, ref + alt, 0.5),
                columns=self.df.columns,index=ref.index)
        self.ratio = ref/((ref+alt)/float(1))
    except AttributeError:
        # raised when self.annot / self.genotypes have not been set yet
        print("Need to run set_annotations and set_genotyeps first")
### Question 3
'''
(a) Generate 1,000 random numbers with Binomial distribution with 𝑛=44 and 𝑝=0.64.
'''
n = 1000
binomial = bn.BinomialDistribution(0.64, 44)
binomial.generateBinomialDistribution(n)

print('Q3. a) Binomial Mean %f' % binomial.meanRdmNumber())
print(' Binomial Std %f' % binomial.stdDeviationRdmNumber())

'''
(b) Draw the histogram. Compute the probability that the random variable X that has
Binomial (44, 0.64) distribution, is at least 40: 𝑃(𝑋≥40). Use any statistics textbook or
online resources for the exact number for the above probability and compare it with your
finding and comment.
'''
binomialdist = binomial.getBinomialDistribution()
bins = np.linspace(0, 44, 45)
plt.hist(binomialdist, bins, alpha=0.5)
plt.title("Histogram of Binomial distribution(0.64,44)")
plt.xlabel("Random Number")
plt.ylabel("Frequency")
plt.show()

# NOTE(review): if getCumulativeDistribution(40) returns P(X <= 40), then
# 1 - cdf is P(X > 40) rather than P(X >= 40); confirm its definition and
# use 39 if so.
cdf_binfunction = binomial.getCumulativeDistribution(40)
print('Q3. b) My Binomial P(X>= 40) %f' % (1-cdf_binfunction))
# binom.sf(k) is P(X > k), so P(X >= 40) is sf(39).
print(' Built in Binomial P(X>=40) %f ' % binom.sf(39, 44, 0.64))
def is_significant(self, alpha=0.05):
    """Return whether the binomial tail probability is below ``alpha``.

    The tail is ``binom.sf(Ny, N, self.mcs)`` where ``Ny`` is the length of
    ``self.id_list`` and ``N`` adds the length of ``self.id_list_negated``.
    """
    num_positive = len(self.id_list)
    num_total = num_positive + len(self.id_list_negated)
    tail_probability = binom.sf(num_positive, num_total, self.mcs)
    return tail_probability < alpha
def get_p_value(self):
    """Return ``binom.sf(Ny, N, self.mcs)``.

    ``Ny`` is the length of ``self.id_list`` and ``N`` adds the length of
    ``self.id_list_negated``.
    """
    num_positive = len(self.id_list)
    num_total = num_positive + len(self.id_list_negated)
    return binom.sf(num_positive, num_total, self.mcs)
def alleles(k, N):
    """Print and return P(X >= N) for X ~ Binomial(2**k, 0.25).

    ``binom.sf(N - 1, ...)`` is ``1 - cdf(N - 1)``, i.e. the probability of at
    least ``N`` successes in ``2**k`` trials.

    The value is printed (as before) and also returned so callers can use it.
    """
    prob = binom.sf(N - 1, 2**k, .25, loc=0)  # 1-cdf, where cdf is P<=quantile
    # Function-call form prints identically on Python 2 and Python 3.
    print(prob)
    return prob
plt.style.use('Solarize_Light2')
from scipy.special import comb
from scipy.stats import binom
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Was misspelt `KNeigborsClassifier`, which raises ImportError.
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): the original line was truncated to `from sklearn.tree` (a
# SyntaxError).  DecisionTreeClassifier is assumed to be the intended import --
# confirm against the rest of the script.
from sklearn.tree import DecisionTreeClassifier

# %%
# Error rate of a majority-vote ensemble as a function of the base error rate:
# the binomial upper tail over `n_classifier` independent base classifiers.
error_range = np.arange(0.0, 1.01, 0.01)
n_classifier = 11
majority = np.ceil(n_classifier / 2)
ens_errors = binom.sf(n_classifier - majority, n_classifier, error_range)

plt.plot(error_range, ens_errors, linewidth=2, label='Ensemble Errors')
plt.plot(error_range, error_range, linestyle='--', label='Base error')
plt.legend(loc='upper left')
plt.show()

# %%
# start writing for majority classifier
# class MajorityVoteClassifier_sush():
#     return None

# %%
# Two iris classes (rows 50 onwards), two features, 50/50 stratified split.
iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1, stratify=y)
def main():
    """Test each transposable-element (TE) class for enrichment in a feature set.

    Command line: ``<feature gff>`` plus options ``-g`` (restrict TEs and
    features to regions in a GFF), ``-r`` (repeat annotation GFF) and ``-s``
    (split statistics by relative strand).

    For every (repeat, family) key the number of distinct overlapping features
    is compared with the expectation ``te_len / genome_length * feature_num``
    using a one-sided binomial test (upper tail when the fold change is > 1,
    lower tail otherwise).  One line per key is printed with a
    Benjamini-Hochberg-style correction from ``fdr.ben_hoch`` appended.

    Temporary files created for the ``-g`` filter are removed on every exit
    path, including the early "Zero features" exit.
    """
    usage = 'usage: %prog [options] <feature gff>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/research/common/data/genomes/hg19/annotation/repeatmasker/hg19.fa.out.tp.gff' % os.environ['HOME'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide a gff file for the feature of interest.')
    else:
        feature_gff = args[0]

    # (fd, path) pairs from mkstemp, released in the finally block below.
    temp_files = []

    try:
        ############################################
        # GFF filter
        ############################################
        # filter TEs and features by gff file
        if options.filter_gff:
            filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
            temp_files.append((filter_merged_bed_fd, filter_merged_bed_file))
            subprocess.call('sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file), shell=True)

            # filter TE GFF
            te_gff_fd, te_gff_file = tempfile.mkstemp()
            temp_files.append((te_gff_fd, te_gff_file))
            subprocess.call('intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file), shell=True)
            options.repeats_gff = te_gff_file

            # filter feature GFF
            feature_gff_gff_fd, feature_gff_gff_file = tempfile.mkstemp()
            temp_files.append((feature_gff_gff_fd, feature_gff_gff_file))
            subprocess.call('intersectBed -u -f 0.5 -a %s -b %s > %s' % (feature_gff, filter_merged_bed_file, feature_gff_gff_file), shell=True)
            feature_gff = feature_gff_gff_file

        ############################################
        # lengths
        ############################################
        # compute feature length
        feature_len, feature_num = feature_stats(feature_gff)

        if feature_num == 0:
            sys.stderr.write('Zero features\n')
            sys.exit()

        # compute size of search space
        if options.filter_gff:
            genome_length = count_bed(filter_merged_bed_file, feature_len)
        else:
            genome_length = count_hg19()

        # hash counted repeat genomic bp
        te_lengths = te_target_size(options.repeats_gff, feature_len)

        ############################################
        # hash TE/feature overlaps
        ############################################
        # initialize: one set of overlapping features per key, plus '*'
        # wildcard keys aggregating over repeats and over families
        te_features = {}
        for rep, fam in te_lengths:
            if options.strand_split:
                te_features[(rep+'+', fam)] = set()
                te_features[('*+', fam)] = set()
                te_features[('*+', '*')] = set()
                te_features[(rep+'-', fam)] = set()
                te_features[('*-', fam)] = set()
                te_features[('*-', '*')] = set()
            else:
                te_features[(rep, fam)] = set()
                te_features[('*', fam)] = set()
                te_features[('*', '*')] = set()

        # universal_newlines=True makes the pipe yield text on Python 3 as
        # well, so the str-based split below works on both major versions.
        p = subprocess.Popen('intersectBed -wo -a %s -b %s' % (options.repeats_gff, feature_gff), shell=True, stdout=subprocess.PIPE, universal_newlines=True)
        for line in p.stdout:
            a = line.split('\t')

            kv = gff.gtf_kv(a[8])
            rep = kv['repeat']
            fam = kv['family']

            fchrom = a[9]
            fstart = int(a[12])
            fend = int(a[13])

            rep_star = '*'
            if options.strand_split:
                tstrand = a[6]
                fstrand = a[15]
                # '+' marks same-strand overlaps, '-' opposite-strand ones
                if tstrand == fstrand:
                    rep += '+'
                    rep_star += '+'
                else:
                    rep += '-'
                    rep_star += '-'

            feature = (fchrom, fstart, fend)
            te_features[(rep, fam)].add(feature)
            te_features[(rep_star, fam)].add(feature)
            te_features[(rep_star, '*')].add(feature)

        p.communicate()

        ############################################
        # compute stats and print
        ############################################
        lines = []
        p_vals = []
        for (rep, fam), overlaps in te_features.items():
            if options.strand_split:
                # strip the trailing strand marker to find the length entry;
                # each strand gets half of the expected rate
                te_len = te_lengths[(rep[:-1], fam)]
                te_p = float(te_len) / (2*genome_length)
            else:
                te_len = te_lengths[(rep, fam)]
                te_p = float(te_len) / genome_length

            te_count = len(overlaps)
            exp_count = te_p * feature_num
            fold_change = te_count / exp_count

            if fold_change > 1:
                # enrichment: P(X >= te_count)
                p_val = binom.sf(te_count-1, feature_num, te_p)
            else:
                # depletion: P(X <= te_count)
                p_val = binom.cdf(te_count, feature_num, te_p)

            p_vals.append(p_val)

            cols = (rep, fam, te_len, te_count, exp_count, fold_change, p_val)
            lines.append('%-18s %-18s %9d %8d %8.1f %8.2f %10.2e' % cols)

        # correct for multiple hypotheses correction
        q_vals = fdr.ben_hoch(p_vals)
        for stat_line, q_val in zip(lines, q_vals):
            print(stat_line + ' %10.2e' % q_val)

    finally:
        ############################################
        # clean
        ############################################
        for temp_fd, temp_path in temp_files:
            os.close(temp_fd)
            os.remove(temp_path)
def _read_score_column(path):
    """Return the second tab-separated column of the file at *path* as a float array."""
    with open(path, "r") as handle:
        return np.asarray([float(row[1]) for row in csv.reader(handle, delimiter="\t")])


def _flanking_scores(scores, flanking_sites):
    """Return the first and last *flanking_sites* entries of *scores*, concatenated."""
    return np.concatenate((scores[0:flanking_sites], scores[-flanking_sites:]))


def _one_sided_pvalue(t_stat, two_sided_p):
    """Convert a two-tailed p-value into the one-tailed p-value for 'greater'."""
    half = two_sided_p * 0.5
    return 1 - half if t_stat < 0 else half


def _ols_line(x, y):
    """Return the ordinary-least-squares fit of *y* on *x*, evaluated at *x*."""
    params = sm.OLS(y, sm.add_constant(x)).fit().params
    if len(params) < 2:
        # Fewer than two parameters were fitted; fall back to x as before.
        return x
    return x * params[1] + params[0]


def _format_pvalue_tex(value):
    """Format *value* with one significant digit, rewriting exponents as ``*10^{..}``."""
    text = "%.1g" % value
    if "e" in text:
        text = (text + "}").replace("e", "*10^{").replace("-0", "-")
    return text


def _style_stem(markerline, stemlines, baseline, color=None):
    """Apply the shared line/marker sizes (and optionally one colour) to a stem plot."""
    if color is not None:
        setp(stemlines, "color", color)
        setp(markerline, "markerfacecolor", color)
        setp(markerline, "color", color)
    setp(stemlines, "linewidth", linewidth)
    setp(markerline, "markersize", markersize)
    setp(markerline, "markeredgewidth", markeredgewidth)
    setp(baseline, "linewidth", linewidth - 0.5)


def _style_scatter_axes(axes, motif_scores, num_occurrences, ylabel):
    """Apply the tick and label styling shared by the score-vs-frequency scatter panels."""
    ticks = np.linspace(1.02 * min(motif_scores), 1.02 * max(motif_scores), num=5, endpoint=True)
    axes.set_xticks(ticks)
    # NOTE(review): num_occurrences falls back to the string "Unknown" when the
    # record lacks the attribute, in which case this division raises -- confirm
    # whether that case can occur.
    axes.set_xticklabels(["$%.2f$" % (x / num_occurrences) for x in ticks])
    axes.yaxis.set_major_locator(plt.MaxNLocator(4))
    axes.set_xlabel(r"$\mathrm{Base}\ \mathrm{Frequency}$", fontsize=fontsize, fontweight="bold")
    axes.set_ylabel(ylabel, fontsize=fontsize, fontweight="bold")
    axes.get_xaxis().tick_bottom()
    axes.get_yaxis().tick_left()
    axes.get_yaxis().set_tick_params(direction="out")
    axes.get_xaxis().set_tick_params(direction="out")
    axes.tick_params(axis="y", which="major", pad=tickpad)
    axes.tick_params(axis="x", which="major", pad=tickpad)
    axes.tick_params("both", length=ticklength, width=2, which="major")


def _blank_header(figure, spec):
    """Create a frameless, tickless axes used as a text header above a panel."""
    header = plt.Subplot(figure, spec, autoscale_on=True)
    header.set_frame_on(False)
    header.set_xticks([])
    header.set_yticks([])
    return header


def create_plot(
    meme_file,
    motif_number,
    flanking_sites,
    sample_phylop_file,
    control_phylop_file,
    sample_gerp_file,
    control_gerp_file,
    peak_file,
    fimo_file,
    annotate,
):
    """Render the combined motif figure for one motif, forward and reverse-complement.

    Two PNGs are written to the current working directory:
    ``motif{N}Combined_plots.png`` and ``motif{N}Combined_plots_rc.png``.  Each
    contains the motif logo above a stem plot of per-position PhyloP scores, a
    PhyloP-vs-base-score scatter panel, an optional GERP scatter panel, a
    histogram of motif distances with a binomial enrichment p-value, and an
    optional metadata table.

    Parameters
    ----------
    meme_file : path parsed with ``motifs.parse(..., "meme")``; the logo images
        ``logo{N}.png`` and ``logo_rc{N}.png`` are read from its directory.
    motif_number : 1-based index of the motif record to plot.
    flanking_sites : number of flanking positions on each side of the motif in
        the score files (the slicing below assumes this is > 0).
    sample_phylop_file, control_phylop_file : tab-separated files whose second
        column holds one score per position.
    sample_gerp_file, control_gerp_file : same layout; the GERP panel is drawn
        only when both are given.
    peak_file, fimo_file : passed to ``get_motif_distances``.
    annotate : path to a JSON file providing the ``title``, ``gene_name``,
        ``dataset`` and ``assembly`` entries of the metadata table; ``None``,
        ``""`` or ``" "`` disables the table.

    Relies on module-level plotting settings (``a``, ``__scale__``, ``dpi``,
    ``fontsize``, ``linewidth``, ...) defined elsewhere in this file.
    """
    # The record is extracted while the file is open; the handle used to leak.
    with open(meme_file) as handle:
        records = motifs.parse(handle, "meme")
        record = records[motif_number - 1]
    num_occurrences = getattr(record, "num_occurrences", "Unknown")

    annotate_dict = None
    if annotate == "" or annotate == " ":
        annotate = None
    elif annotate:
        with open(annotate) as annotation_handle:
            annotate_dict = json.load(annotation_handle)

    has_gerp = bool(sample_gerp_file and control_gerp_file)
    fs = flanking_sites

    sample_phylo_scores = _read_score_column(sample_phylop_file)
    control_phylo_scores = _read_score_column(control_phylop_file)
    if has_gerp:
        sample_gerp_scores = _read_score_column(sample_gerp_file)
        control_gerp_scores = _read_score_column(control_gerp_file)
    assert len(sample_phylo_scores) == len(control_phylo_scores)

    profile = position_wise_profile(getattr(record, score_type), record.length)
    max_occur = find_max_occurence(profile, max_count=1)
    ## motif_scores is an array of scores of the max occurring base at each position of the motif
    motif_scores = np.asarray([position[0][1] for position in max_occur])

    flanking_sample_phylo_scores = _flanking_scores(sample_phylo_scores, fs)
    motif_sample_phylo_scores = sample_phylo_scores[fs:-fs]
    motif_control_phylo_scores = control_phylo_scores[fs:-fs]
    if has_gerp:
        flanking_sample_gerp_scores = _flanking_scores(sample_gerp_scores, fs)
        motif_sample_gerp_scores = sample_gerp_scores[fs:-fs]
        motif_control_gerp_scores = control_gerp_scores[fs:-fs]

    pr_p = pearsonr(motif_scores, motif_sample_phylo_scores)
    if has_gerp:
        pr_g = pearsonr(motif_scores, motif_sample_gerp_scores)

    ## H_0: Mean phylop scores for motif sites and flanking sites are the same
    ## H_1: Mean phylop score for motif sites > Mean phylop score of flanking sites
    ## NOTE: perform_t_test returns a 2-tailed p-value for an independent t-test
    ## with unequal sample size, equal variances
    T_deltaphylop, p_deltaphylop = perform_t_test(motif_sample_phylo_scores, flanking_sample_phylo_scores)
    delta_phylop = np.mean(motif_sample_phylo_scores) - np.mean(flanking_sample_phylo_scores)
    p_deltaphylop = _one_sided_pvalue(T_deltaphylop, p_deltaphylop)
    if has_gerp:
        T_deltagerp, p_deltagerp = perform_t_test(motif_sample_gerp_scores, flanking_sample_gerp_scores)
        delta_gerp = np.mean(motif_sample_gerp_scores) - np.mean(flanking_sample_gerp_scores)
        p_deltagerp = _one_sided_pvalue(T_deltagerp, p_deltagerp)

    ## Ordinary least square fit for conservation scores and motif_scores
    y_reg_phylop_sample = _ols_line(motif_scores, motif_sample_phylo_scores)
    y_reg_phylop_control = _ols_line(motif_scores, motif_control_phylo_scores)
    if has_gerp:
        y_reg_gerp_sample = _ols_line(motif_scores, motif_sample_gerp_scores)
        y_reg_gerp_control = _ols_line(motif_scores, motif_control_gerp_scores)

    motif = record
    motif_length = motif.length
    meme_dir = os.path.dirname(meme_file)

    X = [40 + 15]  ## this is by trial and error, the position for the first base logo
    ## Generate all other X coordinates
    for j in range(1, len(motif) + 2 * fs):
        X.append(X[j - 1] + a + 1.9)

    y_phylop_pixels = [__scale__ * x for x in sample_phylo_scores]

    # Panel bookkeeping shared by both logos: the PhyloP scatter is always in
    # column 0, followed by the optional GERP panel, the histogram and the
    # optional annotation table.
    gerp_col = 1 if has_gerp else None
    histogram_col = 2 if has_gerp else 1
    ann_col = histogram_col + 1 if annotate else None
    n_cols = histogram_col + (2 if annotate else 1)

    ##FIXME This is a big dirty hack to generate plots for the reverse complement logo too
    logo_name = ["logo{}.png".format(motif_number), "logo_rc{}.png".format(motif_number)]
    for ln in logo_name:
        if "rc" in ln:
            y_phylop_pixels.reverse()
        logo = plt.imread(os.path.join(meme_dir, ln))
        height_px = logo.shape[0]  # Should be 212

        # Width reserved to the right of the logo: 4 logo heights for the two
        # base panels plus 2 for each optional panel (GERP, annotation).
        panel_heights = 4 + (2 if has_gerp else 0) + (2 if annotate else 0)
        total_px = X[-1] + panel_heights * height_px + 140
        right = (panel_heights * height_px + 10 + 140 - 0.2 * height_px) / total_px

        # Float literals keep the size fractional under Python 2 as well.
        figsize = (total_px / 100.0, (2 * height_px) / 100.0 + 0.6)
        gs = gridspec.GridSpec(2, 1)
        gs.update(top=1.0, bottom=0.14, left=0.08, right=1 - right)

        f = plt.figure(figsize=figsize, dpi=dpi, facecolor="w", edgecolor="k")

        # logo_plot => Logo
        # stem_plot => per-position PhyloP scores
        logo_plot = plt.Subplot(f, gs[0])

        ##TODO Check this
        if motif_length > 45:
            XSCALE_FACTOR = motif_length / 1.9
            z = 2
        elif motif_length > 40:
            XSCALE_FACTOR = motif_length / 2.25
            z = 2.5
        elif motif_length > 36:
            XSCALE_FACTOR = motif_length / 1.95
            z = 2
        elif motif_length > 21:
            XSCALE_FACTOR = motif_length / 5
            z = 3
        else:
            XSCALE_FACTOR = 4.5
            z = 3

        logo_plot.imshow(
            logo,
            extent=[40 + 15 + z * (a + 1.9), logo.shape[1] + 15 + XSCALE_FACTOR * (a + 1.9), 0, logo.shape[0]],
        )
        logo_plot.set_axis_off()
        f.add_subplot(logo_plot)

        stem_plot = plt.Subplot(f, gs[1], sharex=logo_plot)
        # Flanks are drawn in the flanking colour, the motif core in green.
        # Colours are applied through setp(); stem() itself is not given
        # colour keywords because it does not accept them as named parameters.
        markerline, stemlines, baseline = stem_plot.stem(
            X[:fs], y_phylop_pixels[:fs], markerfmt="_", linefmt="-"
        )
        _style_stem(markerline, stemlines, baseline, color=flankingstemcolor)

        markerline, stemlines, baseline = stem_plot.stem(
            X[fs:-fs], y_phylop_pixels[fs:-fs], markerfmt="g_", linefmt="g-", basefmt="r-"
        )
        _style_stem(markerline, stemlines, baseline)

        markerline, stemlines, baseline = stem_plot.stem(
            X[-fs:], y_phylop_pixels[-fs:], markerfmt="_", linefmt="-"
        )
        _style_stem(markerline, stemlines, baseline, color=flankingstemcolor)

        # Tick positions: unlabelled ticks at both ends of each flank, and a
        # labelled tick at every fifth motif position (1-based labels).
        indices1 = np.linspace(-fs, -1, 2)
        indices2 = np.arange(0, len(X) - 2 * fs, 5)
        indices3 = np.linspace(motif_length, motif_length + fs - 1, 2)
        indices_str = (
            ["" for _ in indices1]
            + ["${}$".format(int(i) + 1) for i in indices2]
            + ["" for _ in indices3]
        )
        indices = np.concatenate((indices1, indices2, indices3))
        xticks = [X[int(i) + fs] for i in indices]

        stem_plot.yaxis.set_major_locator(plt.MaxNLocator(3))
        stem_plot.set_xlabel(r"$\mathrm{Base}\ \mathrm{Position}$", fontsize=fontsize, fontweight="bold")
        stem_plot.set_xlim([1.2 * a, X[-1] + linewidth * 1.8])
        # The upper limit is the larger of the data maximum and 0.01; the
        # previous np.max(values, 0.01) passed 0.01 as the `axis` argument.
        stem_plot.set_ylim(
            [min(np.min(y_phylop_pixels), -0.01) - 0.03, max(np.max(y_phylop_pixels), 0.01)]
        )
        stem_plot.get_xaxis().tick_bottom()
        stem_plot.get_yaxis().tick_left()
        stem_plot.set_xticks(xticks)
        stem_plot.set_xticklabels(indices_str, fontsize=fontsize)
        stem_plot.spines["top"].set_visible(False)
        stem_plot.spines["right"].set_visible(False)
        stem_plot.yaxis.set_ticks_position("left")
        stem_plot.xaxis.set_ticks_position("bottom")
        stem_plot.spines["left"].set_position("zero")
        stem_plot.get_yaxis().set_tick_params(direction="out")
        stem_plot.get_xaxis().set_tick_params(direction="out")
        stem_plot.tick_params(axis="y", which="major", pad=tickpad)
        stem_plot.tick_params(axis="x", which="major", pad=tickpad)
        stem_plot.tick_params("both", length=ticklength, width=2, which="major")
        stem_plot.set_ylabel(r"$\mathrm{PhyloP}\ \mathrm{Score}$", fontsize=fontsize)
        f.add_subplot(stem_plot)

        gs1 = gridspec.GridSpec(2, n_cols, height_ratios=[1, 4], width_ratios=[1] * n_cols)
        gs1.update(bottom=0.14, right=0.95, left=1 - right * 0.85, wspace=0.5)

        # ---- PhyloP legend and scatter panel ----
        phlyop_plots_leg = _blank_header(f, gs1[0, 0])
        textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{Phylop}=%.2f$($p=%s$)\\~\\" % (
            pr_p[0],
            _format_pvalue_tex(pr_p[1]),
            delta_phylop,
            _format_pvalue_tex(p_deltaphylop),
        )
        txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
        phlyop_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
        f.add_subplot(phlyop_plots_leg)

        phylop_scatter_plot = plt.Subplot(f, gs1[1, 0], autoscale_on=True)
        fit_fn = np.poly1d(np.polyfit(motif_scores, motif_sample_phylo_scores, 1))
        phylop_scatter_plot.scatter(
            motif_scores, motif_sample_phylo_scores, color="g", s=[pointsize for i in motif_scores]
        )
        phylop_scatter_plot.plot(
            motif_scores,
            y_reg_phylop_sample,
            "g",
            motif_scores,
            fit_fn(motif_scores),
            color="g",
            linewidth=plot_linewidth,
        )
        phylop_scatter_plot.scatter(
            motif_scores, motif_control_phylo_scores, color=greycolor, s=[pointsize for i in motif_scores]
        )
        phylop_scatter_plot.plot(motif_scores, y_reg_phylop_control, color=greycolor, linewidth=plot_linewidth)
        _style_scatter_axes(
            phylop_scatter_plot, motif_scores, num_occurrences, r"$\mathrm{PhyloP}\ \mathrm{Score}$"
        )
        f.add_subplot(phylop_scatter_plot)

        # ---- GERP legend and scatter panel (only when GERP data was given) ----
        if has_gerp:
            gerp_plots_leg = _blank_header(f, gs1[0, gerp_col])
            # The p-value shown next to the GERP correlation is the GERP one;
            # the PhyloP p-value (pr_p[1]) used to be printed here.
            textstr = r"\noindent$R_{pearson}=%.2f$($p=%s$)\\~\\$\Delta_{{Gerp}}=%.2f$($p=%s$)\\~\\" % (
                pr_g[0],
                _format_pvalue_tex(pr_g[1]),
                delta_gerp,
                _format_pvalue_tex(p_deltagerp),
            )
            txtx = 1 - legend_xmultiplier * len(textstr) / 100.0
            gerp_plots_leg.text(txtx, txty, textstr, fontsize=legend_fontsize)
            f.add_subplot(gerp_plots_leg)

            gerp_scatter_plot = plt.Subplot(f, gs1[1, gerp_col], autoscale_on=True)
            gerp_scatter_plot.scatter(
                motif_scores, motif_sample_gerp_scores, color="g", s=[pointsize for i in motif_scores]
            )
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_sample, color="g", linewidth=plot_linewidth)
            gerp_scatter_plot.scatter(
                motif_scores, motif_control_gerp_scores, color=greycolor, s=[pointsize for i in motif_scores]
            )
            gerp_scatter_plot.plot(motif_scores, y_reg_gerp_control, color=greycolor, linewidth=plot_linewidth)
            _style_scatter_axes(
                gerp_scatter_plot, motif_scores, num_occurrences, r"$\mathrm{GERP}\ \mathrm{Score}$"
            )
            f.add_subplot(gerp_scatter_plot)

        # ---- Enrichment legend and distance histogram ----
        enrichment_plot4 = _blank_header(f, gs1[0, histogram_col])
        all_distances = get_motif_distances(peak_file, fimo_file)
        # Lists (not filter objects) so len() works on Python 3 as well.
        motifs_within_100 = [d for d in all_distances if -100 <= d <= 100]
        motifs_within_100_200 = [d for d in all_distances if 100 < d < 200 or -200 < d < -100]

        if len(motifs_within_100_200) > 0:
            enrichment = float(len(motifs_within_100)) / len(motifs_within_100_200)
        else:
            enrichment = 1
        number_of_sites = len(motifs_within_100) + len(motifs_within_100_200)
        probability = 200.0 / (ENRICHMENT_SEQ_LENGTH - motif_length)
        # NOTE(review): binom.sf(k, ...) is P(X > k); use k - 1 if "at least k
        # sites within 100" is the intended tail -- confirm.
        enrichment_pval = _format_pvalue_tex(
            binom.sf(len(motifs_within_100), number_of_sites, probability)
        )
        textstr = r"\noindent$Enrichment={0:.2f}$\\~\\$(p={1})$".format(enrichment, enrichment_pval)
        txtx = 0.1 * len(textstr) / 100.0
        enrichment_plot4.text(txtx, txty, textstr, fontsize=legend_fontsize)
        f.add_subplot(enrichment_plot4)

        enrichment_plot = plt.Subplot(f, gs1[1, histogram_col], autoscale_on=True)
        enrichment_plot.hist(all_distances, histogram_nbins, color="white", alpha=0.8, range=[-200, 200])
        distance_ticks = [-200, -100, 0, 100, 200]
        enrichment_plot.set_xticks(distance_ticks)
        enrichment_plot.yaxis.set_major_locator(plt.MaxNLocator(3))
        enrichment_plot.set_xticklabels(["${}$".format(x) for x in distance_ticks])
        enrichment_plot.tick_params(axis="y", which="major", pad=tickpad)
        enrichment_plot.tick_params(axis="x", which="major", pad=tickpad)
        enrichment_plot.tick_params("both", length=ticklength, width=2, which="major")
        enrichment_plot.get_xaxis().tick_bottom()
        enrichment_plot.get_yaxis().tick_left()
        enrichment_plot.get_yaxis().set_tick_params(direction="out")
        enrichment_plot.get_xaxis().set_tick_params(direction="out")
        enrichment_plot.axvline(x=-100, linewidth=3, color="red", linestyle="-.")
        enrichment_plot.axvline(x=100, linewidth=3, color="red", linestyle="-.")
        f.add_subplot(enrichment_plot)

        # Output goes to the current working directory (the path joined with
        # the FIMO directory was immediately overwritten in the original).
        if "rc" not in ln:
            out_file = "motif{}Combined_plots.png".format(motif_number)
        else:
            out_file = "motif{}Combined_plots_rc.png".format(motif_number)

        # ---- Optional metadata table ----
        if annotate:
            keys = ["title", "gene_name", "dataset", "assembly"]
            data = [
                [r"$" + key.replace("_", " ").upper() + "$", r"$" + annotate_dict[key] + "$"]
                for key in keys
            ]
            ann_header = _blank_header(f, gs1[0, ann_col])
            f.add_subplot(ann_header)
            textstr = r"$Metadata$"
            txtx = 1.7 * len(textstr) / 100.0
            ann_header.text(txtx, txty, textstr, fontsize=legend_fontsize)

            ann_plot = plt.Subplot(f, gs1[1, ann_col], autoscale_on=True)
            ann_plot.set_xticks([])
            ann_plot.set_yticks([])
            ann_plot.set_frame_on(False)
            table = ann_plot.table(cellText=data, loc="center")
            table.scale(1, 2)
            fontproperties = FontProperties(size=legend_fontsize * 8)
            for (row, col), cell in table.get_celld().items():
                if row > 0 and col > 0:
                    cell.set_text_props(fontproperties=fontproperties)
            table.set_fontsize(legend_fontsize * 8)
            f.add_subplot(ann_plot)

        # The figure size is fixed at figure creation; savefig has no figsize option.
        f.savefig(out_file, dpi=dpi)
def codegen_range_checks(self, probs, failchance):
    """Print ``self.validate_range(...)`` calls with acceptance bounds per order.

    probs: List of tuples of (probability, orders)
        where probability: float in range [0., 1.]
        and orders: list of strings representing selection orders with this
        probability.

    failchance: Chance of Type I error (i.e. chance of a test failing for
        working code due to random chance alone). Too small of a magnitude
        results in worthless tests unless the number of samples is very large
        to compensate.

    Output:
        Prints code for verifying that the number of samples counted for each
        selection order falls within acceptable bounds.  The bounds are read
        off the Binomial(self.nsamples, probability) cdf and survival function
        at ``failchance``.  Returns None.
    """
    self.assertIsNot(self.status, self.STATUS_PREPARE, 'codegen should only be done after self.prepare()')
    # The message must be passed as msg=: the third positional parameter of
    # assertAlmostEqual is `places`, so a positional string broke the check
    # whenever the sum was not exactly 1.0.
    self.assertAlmostEqual(
        1.,
        sum(sorted(p * len(orders) for (p, orders) in probs)),
        msg='total probability should be 1')
    allorders = []
    for _, orders in probs:
        allorders.extend(orders)
    self.assertEqual(len(allorders), len(set(allorders)), 'all orders in probs should be unique')

    # find acceptable ranges for each item
    N = self.nsamples
    counts = np.arange(N + 1)
    ranges = []
    for prob, orders in probs:
        cdfvals = binom.cdf(counts, N, prob)
        sfvals = binom.sf(N - counts, N, prob)  # in backwards (increasing) order for bisect
        ranges.append((
            bisect_left(cdfvals, failchance),       # lo
            N - bisect_right(sfvals, failchance),   # hi
        ))

    # format string for range (align the numbers)
    lolen = max(len(str(lo)) for (lo, hi) in ranges)
    hilen = max(len(str(hi)) for (lo, hi) in ranges)
    range_fmt = '({:%dd}, {:%dd})' % (lolen, hilen)

    # sort descendingly by hi for easier comparison of ranges
    zipped = [(p, o, l, h) for (p, o), (l, h) in zip(probs, ranges)]
    zipped.sort(key=lambda tup: -tup[3])
    probs, ranges = zip(*[((p, o), (l, h)) for p, o, l, h in zipped])

    # go go gadget obnoxiously large comment
    print('######################################################')
    print('# BEGIN CODE AUTOGENERATED BY codegen_range_checks()')
    print('# Parameters:')
    print('# Number of samples: {:d}'.format(self.nsamples))
    print('# Chance of spontaneous failure: ~{:g}'.format(failchance))
    print('#')
    print('# The first two numbers of each line are the "range" of accepted counts in the')
    print('# final distribution. Ideally you want to MINIMIZE OVERLAP between ranges for')
    print('# different rates of occurrence (the comment in each line).')
    print('#')
    print('# The overlap can be reduced by increasing the number of samples, or by increasing')
    print('# failchance by a few orders of magnitude.')

    PER_LINE = 6
    for (prob, orders), (lo, hi) in zip(probs, ranges):
        # put orders into compact string form in case they have been tuplefied
        orders = [''.join(x) for x in orders]
        range_args = range_fmt.format(lo, hi)
        for i in range(0, len(orders), PER_LINE):
            order_args = ', '.join(repr(x) for x in orders[i:i+PER_LINE])  # repr to quote
            print('self.validate_range({}, {}) # p = {:0.4f}'.format(range_args, order_args, prob))

    print('# END AUTOGENERATED CODE')
    print('######################################################')
def main():
    """Print a per-TE-family table of fragment counts, fold change and p-value.

    Command line: one positional argument holding comma-separated BAM files,
    plus the options -c (control BAMs), -g (GFF filter), -m (mapq flag, parsed
    but not read here), -r (repeats GFF) and -s (split by strand).

    Reads the MASK environment variable for the default repeats GFF, and HOME
    for the scratch directory when -g is given.  Temporary files created for
    the GFF filter are removed at the end of a successful run.
    """
    parser = OptionParser('usage: %prog [options] <bam_file,bam_file2,...>')
    parser.add_option('-c', dest='control_bam_files', help='Control BAM file to paramterize null distribution [Default: %default]')
    parser.add_option('-g', dest='filter_gff', help='Filter the TEs by overlap with genes in the given gff file [Default: %default]')
    parser.add_option('-m', dest='mapq', default=False, action='store_true', help='Consider only reads with mapq>0 [Default: %default]')
    parser.add_option('-r', dest='repeats_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-s', dest='strand_split', default=False, action='store_true', help='Split statistics by strand [Default: %default]')
    options, args = parser.parse_args()

    # parser.error() terminates the program, so no else branch is needed
    if len(args) != 1:
        parser.error('Must provide a BAM file.')
    bam_files = args[0].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')
    else:
        control_bam_files = []

    # ---- GFF filter: restrict TEs and alignments to the merged GFF regions ----
    if options.filter_gff:
        filter_merged_bed_fd, filter_merged_bed_file = tempfile.mkstemp()
        merge_cmd = 'sortBed -i %s | mergeBed -i - > %s' % (options.filter_gff, filter_merged_bed_file)
        subprocess.call(merge_cmd, shell=True)

        scratch_dir = '%s/research/scratch/temp' % os.environ['HOME']

        # TE annotations overlapping the filter regions
        te_gff_fd, te_gff_file = tempfile.mkstemp(dir=scratch_dir)
        intersect_cmd = 'intersectBed -a %s -b %s > %s' % (options.repeats_gff, filter_merged_bed_file, te_gff_file)
        subprocess.call(intersect_cmd, shell=True)
        options.repeats_gff = te_gff_file

        # each BAM is replaced, in place in the list, by its filtered copy
        bam_gff_fds = []
        bam_gff_files = []
        for idx, bam_path in enumerate(bam_files):
            fd, filtered_path = tempfile.mkstemp(dir=scratch_dir)
            bam_gff_fds.append(fd)
            bam_gff_files.append(filtered_path)
            bedtools.abam_f1(bam_path, filter_merged_bed_file, filtered_path)
            bam_files[idx] = filtered_path

        # same treatment for the control BAMs
        if control_bam_files:
            cbam_gff_fds = []
            cbam_gff_files = []
            for idx, cbam_path in enumerate(control_bam_files):
                fd, filtered_path = tempfile.mkstemp(dir=scratch_dir)
                cbam_gff_fds.append(fd)
                cbam_gff_files.append(filtered_path)
                bedtools.abam_f1(cbam_path, filter_merged_bed_file, filtered_path)
                control_bam_files[idx] = filtered_path

    # ---- lengths ----
    # read length is simply averaged across replicates
    read_len = stats.mean([estimate_read_length(bam_path) for bam_path in bam_files])

    # size of the search space, and genomic bp covered per repeat family
    if options.filter_gff:
        genome_length = count_bed(filter_merged_bed_file, read_len)
        te_lengths = te_target_size_bed(options.repeats_gff, filter_merged_bed_file, read_len)
    else:
        genome_length = count_hg19()
        te_lengths = te_target_size(options.repeats_gff, read_len)

    # ---- count TE fragments per replicate ----
    fragments = []
    te_fragments = []
    for bam_path in bam_files:
        rep_total, rep_te = count_te_fragments(bam_path, options.repeats_gff, options.strand_split)
        fragments.append(rep_total)
        te_fragments.append(rep_te)

    if control_bam_files:
        control_fragments = []
        control_te_fragments = []
        for cbam_path in control_bam_files:
            rep_total, rep_te = count_te_fragments(cbam_path, options.repeats_gff, options.strand_split)
            control_fragments.append(rep_total)
            control_te_fragments.append(rep_te)

    # ---- combine replicates into fragment rates ----
    def replicate_rate(te_counts, totals, key):
        # geometric mean over replicates of (TE count, defaulting to 1) / total
        return stats.geo_mean([counts.get(key, 1)/float(total)
                               for counts, total in zip(te_counts, totals)])

    te_fragment_rates = {}
    for (rep, fam) in te_lengths:
        if options.strand_split:
            # positive strand first, then negative
            for strand in ('+', '-'):
                key = (rep+strand, fam)
                te_fragment_rates[key] = replicate_rate(te_fragments, fragments, key)
        else:
            te_fragment_rates[(rep, fam)] = replicate_rate(te_fragments, fragments, (rep, fam))

    if control_bam_files:
        control_te_fragment_rates = {}
        for te in te_fragment_rates:
            control_te_fragment_rates[te] = replicate_rate(control_te_fragments, control_fragments, te)

    # ---- compute stats, print table ----
    for (rep, fam) in te_fragment_rates:
        te = (rep, fam)
        rate = te_fragment_rates[te]

        # with strand splitting the last character of rep is the strand
        if options.strand_split:
            te_len = te_lengths[(rep[:-1], fam)]
        else:
            te_len = te_lengths[te]

        # null model: control rate if available, else fraction of the genome
        if options.control_bam_files:
            null_rate = control_te_fragment_rates[te]
        elif options.strand_split:
            null_rate = float(te_len) / (2*genome_length)
        else:
            null_rate = float(te_len) / genome_length

        # fragment counts
        total_fragments = sum(fragments)
        count = rate*total_fragments
        null_count = null_rate*total_fragments

        # fold change
        fold = rate/null_rate if null_rate > 0 else 0

        # p-value of enrichment/depletion: product of per-replicate binomial tails
        enriched = rate > null_rate
        p_val = 1.0
        for rep_te, rep_total in zip(te_fragments, fragments):
            observed = int(rep_te.get(te, 1))
            if enriched:
                p_val *= binom.sf(observed-1, int(rep_total), null_rate)
            else:
                p_val *= binom.cdf(observed, int(rep_total), null_rate)

        cols = (rep, fam, te_len, count, null_count, fold, p_val)
        print('%-18s %-18s %10d %10.1f %10.1f %10.3f %10.2e' % cols)

    # ---- clean up temporary files ----
    if options.filter_gff:
        os.close(filter_merged_bed_fd)
        os.remove(filter_merged_bed_file)
        os.close(te_gff_fd)
        os.remove(te_gff_file)

        for fd, path in zip(bam_gff_fds, bam_gff_files):
            os.close(fd)
            os.remove(path)

        if options.control_bam_files:
            for fd, path in zip(cbam_gff_fds, cbam_gff_files):
                os.close(fd)
                os.remove(path)
# Write one tab-separated row to political_relevants.txt for every node of G
# (other than the 'red' and 'blue' nodes) that meets both criteria below.
# The `with` block guarantees the file is flushed and closed even if a lookup
# inside the loop raises.
with open('political_relevants.txt', 'w') as w:
    for nd in G.nodes():
        if nd in ['red', 'blue']:
            continue

        # book-count criterion: at least 10 entries in 'pol_books'
        uni_books = len(G.node[nd]['pol_books'])
        Cr2 = uni_books >= 10

        # binomial-test criterion
        p = pol_size / (base_nodes - G.node[nd]['size'])
        n = G.node[nd]['cops']
        x = G[nd]['red']['cops'] + G[nd]['blue']['cops']
        # NOTE(review): binom.sf(x, n, p) is P(X > x) and so excludes x itself;
        # a one-sided "at least x" test would be binom.sf(x - 1, n, p).
        # Left unchanged here -- confirm which one is intended.
        p_value = binom.sf(x, n, p)  # binomial test
        Cr1 = p_value < 0.05

        if Cr1 and Cr2:
            red_share = G[nd]['red']['strength']/red_books
            blue_share = G[nd]['blue']['strength']/blue_books
            scale = red_share / (red_share + blue_share)  # disregard this scale measure.
            fields = [
                str(nd),
                str(x),
                str(G[nd]['red']['cops']),
                str(G[nd]['blue']['cops']),
                str(scale),
                str(n),
                str(G.node[nd]['size']),
                str(H),
                str(uni_books),
                format(p_value, '.10f'),
                stripping(index_category[nd]),
            ]
            w.write('\t'.join(fields) + '\n')
def compPairs(fname, minScoreDiff):
    """Check, gene by gene, whether each score ranks a guide pair like modFreq does.

    Rows of the TSV file `fname` with modFreq == 0 are dropped.  Guides are
    grouped by gene (the part of the guide name before the first "-"); genes
    with fewer than two guides are ignored, and for genes with more than two
    the two guides with the lowest modFreq are used.  Pairs whose modFreq
    difference is below `minScoreDiff` are skipped.  For the remaining pairs
    each score counts as correct when it orders the two guides the same way
    as modFreq, and a one-sided binomial p-value (p=0.5) is printed per score.
    """
    print("Comparing guides from %s, minimum score difference %f" % (fname, minScoreDiff))

    # gene -> list of (guideName, modFreq, scores)
    guidesByGene = defaultdict(list)
    for row in iterTsvRows(fname):
        modFreq = float(row.modFreq)
        if modFreq == 0.0:
            continue
        gene = row.guide.split("-")[0]
        scores = {
            "doench": float(row.doench),
            "ssc": float(row.ssc),
            "svm": float(row.svm),
        }
        scores["chariRaw"], scores["chariRank"] = lookupchariScore(row.extSeq[4:27])
        guidesByGene[gene].append((row.guide, modFreq, scores))

    # reduce every gene to exactly two guides
    guidePairs = dict()
    for gene, guides in guidesByGene.items():
        if len(guides) < 2:
            continue
        if len(guides) > 2:
            # ascending by modFreq; the first two entries are kept
            guides.sort(key=operator.itemgetter(1))
        guidePairs[gene] = (guides[0], guides[1])

    # count, per score, how often its ordering agrees with the modFreq ordering
    scoreNames = ["doench", "ssc", "svm", "chariRaw"]
    okCounts = defaultdict(int)
    for gene, (first, second) in guidePairs.items():
        firstName, firstFreq, firstScores = first
        secondName, secondFreq, secondScores = second
        if abs(secondFreq - firstFreq) < minScoreDiff:
            logging.debug("difference not high enough")
            continue
        okCounts["all"] += 1

        logging.debug("guides (%s, %s), modFreq (%f, %f), doench (%f,%f), ssc (%f,%f)" % (firstName, secondName, firstFreq, secondFreq, firstScores["doench"], secondScores["doench"], firstScores["ssc"], secondScores["ssc"]))

        # the score must move in the same direction as modFreq (strictly)
        sameDirection = operator.gt if secondFreq > firstFreq else operator.lt
        anyOk = False
        for scoreName in scoreNames:
            if sameDirection(secondScores[scoreName], firstScores[scoreName]):
                logging.debug(scoreName + " OK")
                okCounts[scoreName] += 1
                anyOk = True
        if not anyOk:
            logging.debug("No score was OK")

    geneCount = okCounts["all"]
    print("total number of genes:" + " " + str(geneCount))

    for scoreType, scoreCount in okCounts.items():
        if scoreType != "all":
            # probability of at least scoreCount agreements out of geneCount by chance
            pVal = binom.sf(scoreCount - 1, geneCount, 0.5)
            print("%s was correct %d times (p-Val %f)" % (scoreType, scoreCount, pVal))
def upperBinom(k, n, p):
    """Upper-tail binomial p-value: probability of k or more successes.

    Evaluates the survival function of Binomial(n, p) at k-1, i.e. P(X >= k),
    the p-value for the actual proportion being higher than p.
    """
    # sf(j) is P(X > j), so shift by one to include k itself
    below_k = k - 1
    tail = binom.sf(below_k, n, p)
    return tail
#!/usr/bin/env python
#Copyright (c) Payton Ide 2014
#View LICENSE.txt for copyright information
"""
Calculates Probability of winning a game of badminton based on the
probability of winning a point

Author: Payton Ide

See README.txt for information regarding scipy stats, the binomial module,
and the uses thereof in this program
"""

from scipy.stats import binom


def game_win_probability(p):
    """Return the probability of winning a game given point-win probability p.

    Raises ValueError when p is outside [0, 1].
    """
    if not 0.0 <= p <= 1.0:
        raise ValueError("point win probability must be in [0, 1], got %r" % (p,))

    #assign values as described in the explanation and solution file
    e = binom.sf(20, 40, p, loc=0)   # more than 20 of 40 points won
    i = binom.pmf(20, 40, p, loc=0)  # exactly 20 of 40 points won
    t = 2*p*(1-p)                    # next two points are split
    d = p**2                         # next two points are both won

    #calculate game win probability using the derived formula:
    # e + i*d*(1 + t + ... + t**8) + i*(t**9)*p, accumulated term by term
    W = e
    for splits in range(9):
        W += i*(t**splits)*d
    W += i*(t**9)*p
    return W


def match_win_probability(W):
    """Return the match win probability derived from game win probability W."""
    return W**2 + 2*(W**2)*(1-W)


def _report(label, value):
    # Emits exactly what the Python 2 statement `print label, value` wrote.
    print(label + " " + str(value))


def main():
    """Prompt for the point win probability and print the derived values."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input      # Python 3
    # Parse the reply as a number; never evaluate user input as code.
    p = float(read_line("Enter probablility: "))

    W = game_win_probability(p)
    V = match_win_probability(W)

    #print user-inputted point win probability, followed by calculated game
    #and match win probabilities
    _report("Probablity of winning a single point: ", p)
    _report("Probablity of winning a game: ", W)
    _report("Probability of winning a match: ", V)


if __name__ == "__main__":
    main()
# Fragment of a statistical self-check script.  N, p_range, repeats, alpha,
# check, numfails, numchecks, mu and sigma2 are all defined outside this
# excerpt.
# NOTE(review): the nesting below was reconstructed from whitespace-collapsed
# source; confirm it (in particular any enclosing loop over N) against the
# full file.
print 'Starting N =', N
for p in p_range:
    # Count how many of `repeats` calls to check(N, p) return a true value.
    num_Np_fails = 0
    num_Np_checks = 0
    for _ in xrange(repeats):
        if check(N, p):
            num_Np_fails += 1
        num_Np_checks += 1
    # work out what the failure probability is (approximately but not exactly 1-alpha
    # because it's a discrete distribution)
    low, high = binom.interval(alpha, N, p)
    # override the interval for the degenerate distributions
    if p==0:
        low = high = 0
    elif p==1:
        low = high = N
    # q: binomial mass strictly below `low` plus strictly above `high`
    # (the 0.1 offsets step just outside the two endpoints)
    q = binom.cdf(low-0.1, N, p)+binom.sf(high+0.1, N, p)
    # range for the number of true results among num_Np_checks calls, each with probability q
    low, high = binom.interval(alpha, num_Np_checks, q)
    if q==0:
        low = high = 0
    if num_Np_fails<low or num_Np_fails>high:
        print 'N=%d, p=%.3f failed %d of %d checks, outside range (%d, %d)' % (N, p, num_Np_fails, num_Np_checks, low, high)
print
# NOTE(review): failrate is not used in the visible code, and numfails,
# numchecks, mu and sigma2 are not updated anywhere in this excerpt.
failrate = float(numfails)/numchecks
# overall range from a normal distribution with mean mu and variance sigma2
low, high = norm.interval(alpha, loc=mu, scale=sqrt(sigma2))
print '%d/%d=%.2f%% failed at %d%%' % (numfails, numchecks, numfails*100.0/numchecks, 100*alpha)
print 'Expected mean=%d, std dev=%d (mean fail rate=%.2f%%)' % (mu, sqrt(sigma2), 100*mu/numchecks)
if low<=numfails<=high:
    print 'Overall passed at %d%%: within range (%d, %d)' % (alpha*100, low, high)
else:
    print 'Overall failed at %d%%: outside range (%d, %d)' % (alpha*100, low, high)