def fisherexacttwotails(table):
    """Return the two-sided Fisher exact test p-value for a 2x2 table.

    Tests H0: p1 = p2 against Ha: p1 != p2 by summing the hypergeometric
    probabilities of every top-left cell count (given the observed margins)
    that is no more probable than the observed count.

    Arguments:
        table {list} -- 2x2 contingency table given as two rows of two counts.

    Returns:
        The two-sided p-value.
    """
    table_array = np.array(table)
    row0 = np.sum(table_array[0, :])
    row1 = np.sum(table_array[1, :])
    col0 = np.sum(table_array[:, 0])

    N = row0 + row1
    m = col0
    k = row0
    # The top-left cell is bounded by its own row total and its own column
    # total (col0); bounding it by col1 silently dropped valid tables.
    max_X = np.min([row0, col0])
    X = np.arange(max_X + 1)

    # calculate probabilities of all possible tables
    possible_table_probab = hypergeom.pmf(X, N, m, k)
    observed_probab = hypergeom.pmf(table_array[0, 0], N, m, k)

    # Small relative slack so that tables exactly as likely as the observed
    # one are not lost to floating-point rounding.
    more_extreme = possible_table_probab <= observed_probab * (1 + 1e-7)
    return np.sum(possible_table_probab * more_extreme)
#===== end file =====
def pvalue(a_true, a_false, b_true, b_false):
    """Return left-, right- and two-tailed hypergeometric p-values.

    The a/b groups are re-expressed as a study drawn from a population:
    the study is group a, the population is a and b together.  The result is
    a ``PValues`` of (left tail, right tail, two tail), each capped at 1.0.
    """
    study_hits = a_true
    study_size = a_false + a_true  # total in study.
    pop_hits = a_true + b_true
    pop_size = pop_hits + a_false + b_false

    lowest = max(0, study_size - (pop_size - pop_hits))
    highest = min(study_size, pop_hits)
    if lowest == highest:
        # Only one outcome is possible, so nothing can be more extreme.
        return PValues(1.0, 1.0, 1.0)

    tolerance = 1e-6
    observed_mass = hypergeom.pmf(study_hits, pop_size, pop_hits, study_size)

    left_tail = 0
    right_tail = 0
    two_tail = 0
    for count in range(lowest, highest + 1):
        mass = hypergeom.pmf(count, pop_size, pop_hits, study_size)
        if count <= study_hits:
            left_tail += mass
        if count >= study_hits:
            right_tail += mass
        # Two-tailed: every outcome at most as likely as the observed one.
        if mass <= observed_mass + tolerance:
            two_tail += mass

    capped = [min(tail, 1.0) for tail in (left_tail, right_tail, two_tail)]
    return PValues(capped[0], capped[1], capped[2])
def two_tailed_fisher_exact(table):
    '''Calculate the two-sided p-value for a 2x2 contingency table.

    Input Parameter:
        table : list
            Contains two elements, each element is a two-element list
            giving one row of the contingency table.

    Return:
        The p-value for table: the summed hypergeometric probability of
        every top-left cell count that is no more probable than the
        observed count.
    '''
    cells = np.array(table)

    #- Margins needed by the hypergeometric model:
    top_row_total = np.sum(cells[0, :])
    bottom_row_total = np.sum(cells[1, :])
    left_col_total = np.sum(cells[:, 0])
    grand_total = top_row_total + bottom_row_total

    #- Every candidate value of the top-left cell:
    largest_count = np.min([top_row_total, left_col_total])
    counts = np.arange(largest_count + 1)

    count_probs = hypergeom.pmf(counts, grand_total, left_col_total,
                                top_row_total)
    observed = hypergeom.pmf(cells[0, 0], grand_total, left_col_total,
                             top_row_total)

    #- Keep only the outcomes at most as probable as the observed one.
    keep = count_probs <= observed
    return np.sum(count_probs * keep)
def validate_over_represented(g, edgeAttr, sig, correcting, isDirected):
    """Return the edges whose weight is over-represented under a hypergeometric null.

    For every edge the p-value is the sum of ``hypergeom.pmf(X, W, sout, sin)``
    for X from the edge weight up to ``min(sout, sin)``, where W is the total
    edge weight and sout/sin are the weighted degrees of the two endpoints
    (out-/in-degree when ``isDirected``).

    Parameters:
        g: graph exposing ``edges(data=True)``, ``number_of_edges()`` and the
            weighted degree methods used below.
        edgeAttr: name of the edge attribute holding the weight.
        sig: significance level.
        correcting: when True, divide ``sig`` by the number of edges
            (Bonferroni).
        isDirected: use out-degree of the source and in-degree of the target
            instead of plain degrees.

    Returns:
        The keys ``(source, target)`` of the edges with p-value below the
        threshold.
    """
    sumWeights = int(np.sum([e[2][edgeAttr] for e in g.edges(data=True)]))

    n_edges = g.number_of_edges()
    if correcting == True and n_edges:
        multivariateSignificanceCorrection = sig / float(n_edges)  # Bonferroni
    else:
        # Also reached for an empty graph, where there is nothing to correct.
        multivariateSignificanceCorrection = sig

    # The earlier implementation also filled a ``pmfHyper`` dict that was
    # never read (and, in its second pass, used the previous edge's degrees);
    # that dead work is dropped here.
    pval = {}
    # ``edges(data=True)`` exists in every networkx release, unlike the
    # ``edges_iter`` method of the 1.x series.
    for source, target, attrs in g.edges(data=True):
        weight = attrs[edgeAttr]
        if isDirected:
            sout = g.out_degree(source, weight=edgeAttr)
            sin = g.in_degree(target, weight=edgeAttr)
        else:
            sout = g.degree(source, weight=edgeAttr)
            sin = g.degree(target, weight=edgeAttr)

        lowerSumLim = int(weight)
        upperSumLim = int(min(sout, sin))
        pval[(source, target)] = sum(
            hypergeom.pmf(X, sumWeights, sout, sin, loc=0)
            for X in range(lowerSumLim, upperSumLim + 1)
        )

    # now apply the statistical correction for performing num edges tests
    validatedDict = {
        edge: p for edge, p in pval.items()
        if p < multivariateSignificanceCorrection
    }
    return validatedDict.keys()
def accumulative_hypergeometric(k, n, K, N):
    '''
    Probability of at least [k] successes in the cluster.

    [k]: SUCCESS IN THE CLUSTER
    [n]: SIZE OF THE CLUSTER
    [K]: SUCCESS IN POPULATION
    [N]: SIZE OF THE POPULATION
    '''
    hits, cluster_size, pop_hits, pop_size = (int(v) for v in (k, n, K, N))

    strictly_above = hyp.sf(hits, pop_size, pop_hits, cluster_size)
    if not strictly_above < 1:
        # Survival function saturated: fall back to the complement of the cdf.
        return (1 - hyp.cdf(hits, pop_size, pop_hits, cluster_size)
                + hyp.pmf(hits, pop_size, pop_hits, cluster_size))
    return strictly_above + hyp.pmf(hits, pop_size, pop_hits, cluster_size)
def compute_risk(self, sub_audit: PairwiseAudit, votes_for_winner: int = None, current_round: int = None, *args, **kwargs) -> float:
    """Compute the risk level for a sample, round size and subaudit.

    The hypergeometric likelihood of observing votes_for_winner in a sample
    of current_round ballots is evaluated for every possible number of
    winner ballots in the contest (0 .. contest_ballots), multiplied by the
    subaudit's prior and, when its total is positive, normalised.  The risk
    is the mass of the lower half of that distribution, i.e. of the entries
    with index 0 .. floor(contest_ballots / 2).

    Args:
        sub_audit (PairwiseAudit): Subaudit supplying the prior and the
            contest size.
        votes_for_winner (int): Votes found for reported winner in the
            current round size.
        current_round (int): Current round size.

    Returns:
        float: Value for risk of given sample and round size.
    """
    total_ballots = sub_audit.sub_contest.contest_ballots
    winner_totals = np.arange(total_ballots + 1)

    likelihood = np.array(
        hg.pmf(votes_for_winner, total_ballots, winner_totals, current_round))
    weighted = sub_audit.prior * likelihood

    mass = sum(weighted)
    if mass > 0:
        weighted = weighted / mass

    lower_half = range(math.floor(total_ballots / 2) + 1)
    return sum(weighted[lower_half])
def calc_hg_enrichment_pval(mat, a, arm_a, aneu_type_a, b, arm_b, aneu_type_b):
    """Hypergeometric p-value for the co-occurrence of two column states.

    Column ``"{a}{arm_a}"`` of ``mat`` is compared with ``aneu_type_a`` and
    column ``"{b}{arm_b}"`` with ``aneu_type_b``.  The returned value is
    ``sf + pmf`` at the overlap count, i.e. the probability of an overlap at
    least as large as the observed one, with the number of rows of ``mat`` as
    population size.
    """
    col_a = mat.loc[:, "{}{}".format(a, arm_a)].values
    col_b = mat.loc[:, "{}{}".format(b, arm_b)].values

    is_a = col_a == aneu_type_a
    is_b = col_b == aneu_type_b

    n_overlap = np.sum(np.logical_and(is_a, is_b))
    n_a = np.sum(np.logical_and(is_a, col_b != aneu_type_b))
    n_b = np.sum(np.logical_and(col_a != aneu_type_a, is_b))

    population = mat.shape[0]
    total_a = n_overlap + n_a
    total_b = n_overlap + n_b

    strictly_more = hypergeom.sf(n_overlap, population, total_a, total_b)
    exactly = hypergeom.pmf(n_overlap, population, total_a, total_b)
    pval = strictly_more + exactly
    return pval
def beta(u_i, i, n):
    # beta(i,u_i)=P(U_i=u_i): hypergeometric pmf summed over every j in
    # 0..n except int(n/2), then divided by n.
    middle = int(n / 2)
    candidates = chain(range(middle), range(middle + 1, n + 1))
    total = sum(hypergeom.pmf(u_i, n, j, i) for j in candidates)
    return total / n
def plot(self, x, n, p):
    """Plot a probability mass function over the points ``x`` and show it.

    NOTE(review): ``scipy.stats.hypergeom.pmf`` takes the evaluation points
    plus three shape parameters (M, n, N); only two are forwarded here, so
    the call below is left exactly as written and still needs its intended
    parametrisation confirmed by the owner of this code.
    """
    pmf = hypergeom.pmf(x, n, p)
    plt.plot(x, pmf, 'o-')
    # 'value' is not a font size matplotlib accepts; use the default size.
    plt.title('HyperGeometric: n=%i , p=%.2f' % (n, p))
    plt.xlabel('Number of successes')
    # was plt.ylable(...), which does not exist
    plt.ylabel('Probability of Successes')
    plt.show()
def test_alg1_alg2_accuracy_difference():
    """Check that pval2 stays finite and exact where pval1 produces NaN.

    For a ranked list of ``a`` ones followed by ``b`` zeros the reference
    value is the hypergeometric pmf of drawing all ``a`` successes; pval2 must
    match it within ``tol`` for every ``b``, while pval1 is expected to return
    NaN for at least one of them.
    """
    a = 20
    bvals = np.arange(50, 100, dtype=np.int64)
    tol = 1e-12

    cases = bvals.size
    truth = np.zeros(cases, dtype=np.float64)
    pval1 = np.zeros(cases, dtype=np.float64)
    pval2 = np.zeros(cases, dtype=np.float64)

    for idx, zeros in enumerate(int(b) for b in bvals):
        N = a + zeros
        K = a
        X = 1
        L = N
        truth[idx] = hypergeom.pmf(K, N, K, a)

        v = np.r_[np.ones(a, dtype=np.uint8), np.zeros(zeros, dtype=np.uint8)]
        stat, _ = mhg.get_xlmhg_stat(v, X, L)
        pval1[idx] = mhg.get_xlmhg_pval1(N, K, X, L, stat)
        pval2[idx] = mhg.get_xlmhg_pval2(N, K, X, L, stat)

    assert np.all(~np.isnan(pval2))
    assert np.any(np.isnan(pval1))
    for got, want in zip(pval2, truth):
        assert mhg.is_equal(got, want, tol=tol)
def test_nhypergeom_pmf():
    # Cross-check the negative hypergeometric pmf against an expression
    # built from the ordinary hypergeometric pmf.
    M, n, r = 45, 13, 8
    k = 6
    draws = k + r - 1

    from_hypergeom = (hypergeom.pmf(k, M, n, draws)
                      * (M - n - (r - 1)) / (M - draws))
    from_nhypergeom = nhypergeom.pmf(k, M, n, r)
    assert_allclose(from_hypergeom, from_nhypergeom, rtol=1e-10)
def calcPValues(n, N1, params, P_actu):
    """Return ``sf + 0.5 * pmf`` of the hypergeometric law for each observation.

    Entry i uses ``P_actu[i]`` as the observed count and ``params[i]`` as the
    number of successes in a population of size ``N1`` sampled ``n`` times.
    """
    pvalues = []
    for idx in range(len(P_actu)):
        observed = P_actu[idx]
        successes = params[idx]
        strictly_above = hypergeom.sf(observed, N1, successes, n)
        half_point_mass = 0.5 * hypergeom.pmf(observed, N1, successes, n)
        pvalues.append(strictly_above + half_point_mass)
    return pvalues
def probability_same(assignment):
    """Sum ``hypergeom.pmf(2, D, 2, N)`` over columns minus over column pairs.

    D is the number of rows of ``assignment``.  For single columns N is the
    column sum; for each pair of distinct columns N is the size of their
    ``intersection``.  Terms with N == 0 are skipped.
    """
    D = assignment.shape[0]
    columns = assignment.shape[1]

    single_total = 0
    for col in range(columns):
        size = sum(assignment[:, col])
        if size > 0:
            single_total += hypergeom.pmf(2, D, 2, size)

    pair_total = 0
    for first in range(columns):
        for second in range(first + 1, columns):
            size = len(intersection(assignment, first, assignment, second))
            if size > 0:
                pair_total += hypergeom.pmf(2, D, 2, size)

    return single_total - pair_total
def cHgPvl(x, M, n, N):
    """Return the upper-tail hypergeometric p-value P(X >= x).

    x=randVar M=popSize n=totalSuccesses N=samplSize

    Computed as ``sf(x) + pmf(x)`` rather than ``1 - cdf(x) + pmf(x)``: the
    two are the same quantity, but subtracting a cdf that is close to 1 from
    1 destroys the result for small p-values.
    """
    return hypergeom.sf(x, M, n, N) + hypergeom.pmf(x, M, n, N)
def test_hypergeom():
    """
    Compare mine with scipy.stats.

    Computes the mean of a hypergeometric distribution 100 times with
    scipy's pmf and 100 times with ``hypergeom_pmf_iterator``, then prints
    the average absolute error against the closed-form mean and the elapsed
    time of each variant.

    :return:
    """
    from scipy.stats import hypergeom
    import time

    # M = population, N = number of trials, n = number of successes, k = point
    pop = 100
    trials = 50
    successes = 30
    failures = pop - successes
    kmin = max(0, trials - failures)
    kmax = min(trials, successes)

    expect = trials * float(successes) / pop
    iterations = 100

    ss_results = [0 for _ in range(iterations)]
    ss_start_time = time.time()
    for run in range(iterations):
        ss_results[run] = sum(
            i * hypergeom.pmf(k=i, N=trials, M=pop, n=successes)
            for i in range(kmin, kmax + 1))
    ss_time = time.time() - ss_start_time

    my_results = [0 for _ in range(iterations)]
    my_start_time = time.time()
    for run in range(iterations):
        # The iterator yields probabilities starting at k = kmin.
        my_results[run] = sum(
            (i + kmin) * pr for i, pr in enumerate(
                hypergeom_pmf_iterator(N=pop, k=successes, n=trials)))
    my_time = time.time() - my_start_time

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("Average scipy error:")
    print(sum(abs(r - expect) for r in ss_results) / iterations)
    print("Average my implementation error:")
    print(sum(abs(r - expect) for r in my_results) / iterations)
    print("Scipy execution time:")
    print(ss_time)
    print("My implementation execution time:")
    print(my_time)
def current_dist_null(self):
    """Update distribution_null for current round.

    The first round stores the pmf of the round draw directly (binomial when
    sampling with replacement, hypergeometric otherwise).  Later rounds
    combine the stored distribution with the distribution of the newly drawn
    ballots: by convolution with replacement, and by conditioning on every
    previous winner count in the relevant interval without replacement.
    """
    first_round = len(self.rounds) == 1
    round_draw = self.rounds[0] if first_round else self.rounds[-1] - self.rounds[-2]

    # Number of winner votes under the null
    # (if null_margin is 0 this is a tie).
    Nw_exact = (self.contest.contest_ballots + self.null_margin) / 2
    Nw = math.floor(Nw_exact)

    if self.replacement:
        draw_dist = binom.pmf(range(0, round_draw + 1), round_draw,
                              Nw_exact / self.contest.contest_ballots)
        if first_round:
            self.distribution_null = draw_dist
        else:
            # Sum of independent draws: convolve with the earlier rounds.
            self.distribution_null = fftconvolve(self.distribution_null, draw_dist)
        return

    if first_round:
        # Simply compute hypergeometric for 1st round distribution
        self.distribution_null = hypergeom.pmf(np.arange(round_draw + 1),
                                               self.contest.contest_ballots,
                                               Nw, round_draw)
        return

    cumulative = [0] * (self.rounds[-1] + 1)
    # Only the relevant interval of the previous distribution is visited.
    interval = self.__get_interval(self.distribution_null)
    for prev_count in range(interval[0], interval[1] + 1):
        ballots_left = self.contest.contest_ballots - self.rounds[-2]
        winner_left = Nw - prev_count
        this_round = hypergeom.pmf(np.arange(round_draw + 1), ballots_left,
                                   winner_left, round_draw)
        # Joint probability of prev_count earlier and new_count now.
        for new_count in range(round_draw + 1):
            joint = self.distribution_null[prev_count] * this_round[new_count]
            cumulative[prev_count + new_count] += joint
    self.distribution_null = cumulative
def test_nch_hypergeom(self, dist_name): # Both noncentral hypergeometric distributions reduce to the # hypergeometric distribution when odds = 1 dists = {'nchypergeom_fisher': nchypergeom_fisher, 'nchypergeom_wallenius': nchypergeom_wallenius} dist = dists[dist_name] x, N, m1, n = self.x, self.N, self.m1, self.n assert_allclose(dist.pmf(x, N, m1, n, odds=1), hypergeom.pmf(x, N, m1, n))
def main():
    """Print the hypergeometric probability of at least ``x`` successes.

    Reads x, n, M and N from the command line, validates them with
    ``verify`` and prints the sum of ``hypergeom.pmf(i, N, M, n)`` for
    i from x to min(n, M).
    """
    args = parser.parse_args()
    x, n, M, N = args.x, args.n, args.M, args.N
    verify(x, n, M, N)

    total_probability = sum(
        hypergeom.pmf(i, N, M, n) for i in range(x, min(n, M) + 1))
    # print() with one argument works on both Python 2 and Python 3.
    print(total_probability)
def hg(cardname, quantity, percent):
    """Return a hypergeometric pmf value scaled by ``percent``.

    The count stored for ``cardname`` in ``carddict`` is the evaluation
    point, the deck size is 60, ``quantity`` is the number of successes and
    the number of draws is the sum of all counts in ``carddict``.  Returns 0
    when ``cardname`` is not in ``carddict``.
    """
    try:
        drawn_copies = carddict[cardname]
    except KeyError:
        return 0

    deck_size = 60
    total_drawn = sum(carddict.values())
    probability = hypergeom.pmf(k=drawn_copies, M=deck_size, n=quantity,
                                N=total_drawn)
    return probability * percent
def fisher_exact_test_pval(a, x, n, N):
    """Return the smaller of the two hypergeometric tail probabilities at ``a``.

    The distribution has population size ``N``, ``n`` successes and
    ``int(x)`` draws.  When ``int(x)`` is 0 or ``N`` the result is 1.
    """
    draws = int(x)
    if draws == 0 or draws == N:
        return 1

    mass = hypergeom.pmf(np.arange(N + 1), N, n, draws)
    at_most = mass[:(a + 1)].sum()
    at_least = mass[a:].sum()
    return np.minimum(at_most, at_least)
def epsilon(u_i, i, n):
    # epsilon(u_i,i,n)=P(X_i+1 = 1 | U_i=u_i): gamma-weighted hypergeometric
    # pmf summed over every k in 0..n except int(n/2), divided by n * beta.
    normaliser = beta(u_i, i, n)
    middle = int(n / 2)
    weighted = sum(
        gamma(k, u_i, i, n) * hypergeom.pmf(u_i, n, k, i)
        for k in chain(range(middle), range(middle + 1, n + 1))
    )
    return weighted / (n * normaliser)
def functional_enrichment(dicbp2, N, geneset_toenrich, type_correction, pvalue_threshold=0.05):
    """
    Calculate the functional enrichment using the method described by Carlota.

    For every term that shares at least one gene with the gene set, the
    p-value is the sum of ``hypergeom.pmf(i, N, m, k)`` for i from the overlap
    size x up to m.  The p-values are then corrected with ``multipletests``
    and the terms whose corrected p-value is below the threshold are reported.

    Parameters:
    @dicbp2: A dictionary containing the associations of functions to their corresponding genes
    @N: The total number of genes
    @geneset_toenrich: The gene set to check the enrichment
    @type_correction: The multiple test p-value correction (fdr_bh / bonferroni)
    @pvalue_threshold: P-value threshold of the multiple test p-value correction

    Returns a tuple ``(test_passed_f, terms_l, term_to_values)``: whether any
    term passed, the passing terms, and for each passing term the list
    ``[pval, pval_corrected, x, m]``.
    """
    k = len(geneset_toenrich)
    # Hash-based membership instead of scanning the gene list for every gene
    # of every term; fall back to the original container if not hashable.
    try:
        gene_lookup = set(geneset_toenrich)
    except TypeError:
        gene_lookup = geneset_toenrich

    pvals = {}
    genes_enriched = {}
    for term, term_genes in dicbp2.items():
        m = len(term_genes)
        x = sum(1 for gene in term_genes if gene in gene_lookup)
        if x != 0:
            # calculation of the hypervalue
            dhypervalue = hypergeom.pmf(list(range(x, m + 1)), N, m, k)
            # threshold of enrichment
            pvals[term] = sum(dhypervalue)
            genes_enriched[term] = [x, m]

    terms_l = []
    term_to_values = {}
    test_passed_f = False

    if pvals:
        terms = list(pvals.keys())
        pvals_values = list(pvals.values())
        # Convert the corrected p-values once rather than twice per term.
        corrected = list(multipletests(pvals_values, alpha=0.05,
                                       method=type_correction,
                                       is_sorted=False,
                                       returnsorted=False)[1])
        for term, pval, pval_corrected in zip(terms, pvals_values, corrected):
            if pval_corrected < pvalue_threshold:
                x, m = genes_enriched[term]
                term_to_values[term] = [pval, pval_corrected, x, m]
                test_passed_f = True
                terms_l.append(term)

    return test_passed_f, terms_l, term_to_values
def wang_cpci(n, N, M, lciw, uciw):
    """Wang method: the coverage probability function.

    Returns one value per entry of M: the sum over j = 0..n of
    ``wang_ind(M[i], lciw[j], uciw[j]) * hypergeom.pmf(j, N, M[i], n)``.
    """
    coverage = []
    for idx in range(len(M)):
        terms = [
            wang_ind(M[idx], lciw[j], uciw[j]) * hypergeom.pmf(j, N, M[idx], n)
            for j in range(n + 1)
        ]
        coverage.append(sum(terms))
    return coverage
def calcH(N, Z):
    # Input: N group size, Z population size
    # Output: H[k,K] hypergeometric function
    #         (k individuals in group, K individuals in population)
    import numpy as np
    from scipy.stats import hypergeom

    group_counts = np.arange(0, N + 1)
    H = np.zeros((N + 1, Z + 1))
    # Fill one column per population count K, all group counts k at once.
    for K in range(0, Z + 1):
        H[:, K] = hypergeom.pmf(group_counts, Z, K, N)
    return H
def geometric_test(G, list1, list2, genes, overlap=119, N=10261):
    """Print and return a hypergeometric enrichment p-value for an overlap.

    M and n are the numbers of nodes of ``G`` found in ``list1`` and
    ``list2``.  The p-value is the probability of an overlap of at least
    ``overlap`` when n items are drawn from a population of N that contains
    M successes.

    ``overlap`` and ``N`` default to the values that used to be hard-coded.
    ``genes`` is accepted for backward compatibility and is not used.

    NOTE(review): this used to print ``hypergeom.pmf(overlap - 1, M, n, N)``,
    i.e. a single point mass with the population size in the draw-count
    slot.  That looks like a mis-adapted ``sf(overlap - 1, ...)`` upper tail,
    which is what is computed now -- confirm against the intended analysis.
    """
    M = len(list(G.subgraph(list1)))
    n = len(list(G.subgraph(list2)))
    # Formatted explicitly so Python 2 and 3 print the same line.
    print("%s %s %s %s" % (overlap, M, n, N))
    # P(X >= overlap) = sf(overlap - 1); scipy's order is (k, population,
    # successes in population, draws).
    pval = hypergeom.sf(overlap - 1, N, M, n)
    print(pval)
    return pval
def matrix_double_sample_selection(n, N, s=0):
    """Build the (2n+1) x (n+1) matrix of Qs values weighted by hypergeometric pmfs.

    Entry [ip, io] accumulates ``Qs(io, n, ic, nc, N, s, cache) *
    hypergeom.pmf(ic, 2*n, ip, nc)`` over all nc in 0..2n and ic in 0..nc.
    Returns the matrix together with the NaN-initialised cache array that is
    handed to ``Qs``.
    """
    width = 2 * n + 1
    mtx = np.zeros((width, n + 1))
    cache = np.full((width, width, width, width), np.nan)

    for ip in range(width):
        for io in range(n + 1):
            for nc in range(width):
                for ic in range(nc + 1):
                    weight = hypergeom.pmf(ic, 2 * n, ip, nc)
                    mtx[ip, io] += Qs(io, n, ic, nc, N, s, cache) * weight
    return mtx, cache
def matrix(no=5, s=1 / 1000, N=1000, max_t=1):
    """Build the (no+1) x (no+1) matrix Q(i_o, i_p) and the two P0 caches.

    For every n_c in 0..no and i_c in 0..n_c, column i_o of the matrix
    accumulates ``P0(...)`` times the hypergeometric pmf of i_c evaluated for
    all i_p at once.  Returns ``(mtx, cache_P0, cache_Pf)``.
    """
    size = no + 1
    mtx = np.zeros((size, size))  # mtx = Q in text (i_o,i_p)
    # p0 = T_0 in text; arguments (i_o,n_o,i_p,n_p)
    cache_P0 = np.full((size, size, size, size), np.nan)
    # pf = T_r (i_o,n_o,i_p,n_p,r)
    cache_Pf = np.full((size, size, size, size, max_t + 1), np.nan)

    ip_values = np.arange(0, size)
    for nc in range(size):  # n_c is n_p in text
        for ic in range(nc + 1):
            # vectorized over ip
            p = hypergeom.pmf(ic, no, ip_values, nc)
            for io in range(size):
                factor = P0(io, no, ic, nc, s, N, max_t, cache_P0, cache_Pf)
                mtx[:, io] += factor * p
    return mtx, cache_P0, cache_Pf
def matrix_selection(n, N, s=0):
    """Build the (n+1) x (n+1) matrix of Qs values weighted by hypergeometric pmfs.

    For every nc in 0..n and ic in 0..nc, column io accumulates
    ``Qs(io, n, ic, nc, N, s, cache)`` times the hypergeometric pmf of ic
    evaluated for all ip at once.  Returns ``(mtx, cache)``.
    """
    size = n + 1
    mtx = np.zeros((size, size))
    # Allocates more than strictly needed; a dense array is simpler to pass
    # around than per-size nested lists.
    cache = np.full((size, size, size, size), np.nan)

    ip_values = np.arange(0, size)
    for nc in range(size):
        for ic in range(nc + 1):
            # vectorized over ip
            p = hypergeom.pmf(ic, n, ip_values, nc)
            for io in range(size):
                mtx[:, io] += Qs(io, n, ic, nc, N, s, cache) * p
    return mtx, cache
def probability_class_cluster(clazz, klust):
    """Sum ``hypergeom.pmf(2, D, 2, N)`` over cluster/class pairs minus over pairs of pairs.

    D is the number of rows of ``klust``.  The first sum runs over every
    (cluster column, class column) pair with N the size of their
    ``intersection``.  The second runs over every combination of two such
    pairs with strictly increasing cluster and class indices, with N the size
    of the overlap of the two intersections.  Terms with N == 0 are skipped.
    """
    D = klust.shape[0]
    clusters = klust.shape[1]
    classes = clazz.shape[1]

    pair_total = 0
    for i in range(clusters):
        for j in range(classes):
            size = len(intersection(klust, i, clazz, j))
            if size > 0:
                pair_total += hypergeom.pmf(2, D, 2, size)

    overlap_total = 0
    for i in range(clusters):
        for j in range(classes):
            for ip in range(i + 1, clusters):
                for jp in range(j + 1, classes):
                    first = intersection(klust, i, clazz, j)
                    second = intersection(klust, ip, clazz, jp)
                    size = len(numpy.intersect1d(first, second))
                    if size > 0:
                        overlap_total += hypergeom.pmf(2, D, 2, size)

    return pair_total - overlap_total
def cpci(M):
    """Return one coverage value per entry of M.

    Each value is the sum over uu = 0..n of
    ``ind(M[i], lciw[uu], uciw[uu]) * hypergeom.pmf(uu, N, M[i], n)``, where
    n, N, lciw and uciw are read from the enclosing scope.
    """
    coverage = np.arange(len(M)).astype(np.float64)
    for idx in np.arange(len(M)):
        terms = np.arange(n + 1).astype(np.float64)
        uu = 0
        while uu < n + 0.5:
            indicator = ind(M[idx], lciw[uu], uciw[uu])
            terms[uu] = indicator * hypergeom.pmf(uu, N, M[idx], n)
            uu += 1
        coverage[idx] = sum(terms)
    return coverage
def matrix(no=5, s=1 / 1000, N=1000, max_t=1):
    """Build the (no+1) x (no+1) matrix of P0 values weighted by hypergeometric pmfs.

    For every nc in 0..no and ic in 0..nc, column io accumulates ``P0(...)``
    times the hypergeometric pmf of ic evaluated for all ip at once.
    Returns ``(mtx, cache_P0, cache_Pf)``; both caches start filled with NaN
    and are handed to ``P0``.
    """
    size = no + 1
    mtx = np.zeros((size, size))
    cache_P0 = np.full((size, size, size, size), np.nan)
    cache_Pf = np.full((size, size, size, size, max_t + 1), np.nan)

    ip_values = np.arange(0, size)
    for nc in range(size):
        for ic in range(nc + 1):
            # vectorized over ip
            p = hypergeom.pmf(ic, no, ip_values, nc)
            for io in range(size):
                factor = P0(io, no, ic, nc, s, N, max_t, cache_P0, cache_Pf)
                mtx[:, io] += factor * p
    return mtx, cache_P0, cache_Pf
def checktheory(thres, n, ne, p, te, s):
    """Return failure estimates under the new and the old model.

    Parameters are used as follows: ``thres`` shifts the binomial survival
    threshold, ``n`` is the vector length, ``p`` the per-coordinate binomial
    probability, ``s`` a mapping whose keys enter the threshold and whose
    values weight the terms, ``te`` an iterable of error-correction settings
    and ``ne`` a value passed through to ``LACprob``.

    Returns:
        (new, old): two lists with one entry per element of ``te``.  ``new``
        applies ``LACprob`` per norm pair before averaging, ``old`` applies
        it once to the overall average failure probability.
    """
    # calculate failure probability for a certain number of ones in se
    pfaildict = {}
    for se in range(0, n + 1):
        tmp = [binom.sf((thres + i + se) / 2, se, p=0.5) * s[i] for i in s]
        tmp2 = [binom.sf((thres + 1 + i + se) / 2, se, p=0.5) * s[i] for i in s]
        pfaildict[se] = 1.5 * sum(tmp) + 0.5 * sum(tmp2)

    # set everything to zero
    fail = 0
    fail2 = {te1: 0 for te1 in te}

    # loop over all norm values
    norm_pairs = itertools.combinations_with_replacement(range(0, n + 1), 2)
    # Unordered pairs with repetition from n + 1 values: (n+1)(n+2)/2 of them
    # (the progress bar used to be told n(n+1)/2).
    pair_count = (n + 1) * (n + 2) // 2
    for l1, l2 in tqdm(norm_pairs, leave=False, total=pair_count):
        # probability of a certain norm
        pl = binom.pmf(l1, n=n, p=p) * binom.pmf(l2, n=n, p=p)
        if l1 != l2:
            # (l1, l2) and (l2, l1) are visited only once
            pl *= 2

        # skip if probability is too small
        if pl < 2**-200:
            continue

        # calculate the probability of a failure
        failtmp = 0
        # loop over all possible number of nonzero elements in se
        for se1 in range(max(0, l1 + l2 - n), min(l1, l2) + 1):
            # probability of number of nonzero elements in se
            pse = hypergeom.pmf(k=se1, M=n, n=l1, N=l2)
            # weighted average share of the failure probability for se1
            failtmp += pse * pfaildict[se1]

        fail += pl * failtmp
        # for new model, take error correction into account
        for te1 in te:
            fail2[te1] += pl * LACprob(failtmp, ne, te1)

    new = []
    old = []
    for te1 in te:
        # for old model, take error correction into account
        old.append(LACprob(fail, ne, te1))
        new.append(fail2[te1])
    return new, old
def rho(i, u_i, n):
    # n = number of boxes
    # Rho(i,x) = P(red majority given U_i=u_i)
    # Numerator: P(majority red and U_i=u_i), i.e. the pmf
    # P(u_i | u_n=j) = hypergeom.pmf(u_i, n, j, i) summed over the majority
    # values j = int(n/2)+1 .. n and divided by n.
    majority = range(int((n / 2)) + 1, n + 1)
    numerator = sum(hypergeom.pmf(u_i, n, j, i) for j in majority) / n
    # Denominator: P(U_i=u_i)
    denominator = beta(u_i, i, n)
    return numerator / denominator
def visualize(hgt_preprocessing_file_name):
    """Load a matrix from BASE_OUTPUT_DIR and show it as a colour-coded image.

    Zero entries are replaced by -1 before plotting.  Two hypergeometric
    upper-tail curves (``sf + pmf``) are scanned for the first index that
    crosses ``mHGT``; the two resulting points are joined by a green line
    drawn on top of the image.

    NOTE(review): ``left_edge`` / ``top_edge`` are only assigned when the
    corresponding scan finds a crossing -- confirm the input guarantees one.
    """
    HGTs = np.load(os.path.join(BASE_OUTPUT_DIR, hgt_preprocessing_file_name))
    HGTs[HGTs == 0] = -1
    # B and N are the largest row and column indices of the matrix.
    B, N = np.shape(HGTs)
    B -= 1
    N -= 1
    mHGT = 0.0002  # threshold used for both scans and for the colour bounds
    # Upper-tail probability P(X >= i) along the diagonal (i draws, i hits).
    left_tails = [(hypergeom.sf(i, N, B, i) + hypergeom.pmf(i, N, B, i)) for i in range(0, B + 1)]
    # Upper-tail probability P(X >= B) for an increasing number of draws i.
    top_tails = [(hypergeom.sf(B, N, B, i) + hypergeom.pmf(B, N, B, i)) for i in range(0, N + 1)]
    # First diagonal index whose tail drops to the threshold or below.
    for i, cur in enumerate(left_tails):
        if cur <= mHGT:
            left_edge = (i, i)
            break
    # First draw count whose tail reaches the threshold; keep the index before it.
    for i, cur in enumerate(top_tails):
        if cur >= mHGT:
            top_edge = (i - 1, B)
            break
    # Line through the two edge points (computed but not used below).
    slope = (left_edge[1] - top_edge[1]) / ((left_edge[0] - top_edge[0]) * 1.0)
    constant = top_edge[1] - slope * top_edge[0]
    cmap = colors.ListedColormap(["red", 'gray', 'skyblue', "red"])
    bounds = [-1, -0.1, mHGT, 1, 1]
    norm = colors.BoundaryNorm(bounds, cmap.N)
    fig, ax = plt.subplots()
    ax.imshow(HGTs, cmap=cmap, norm=norm)
    # draw gridlines
    ax.grid(which='minor', axis='both', linestyle='-', color='k', linewidth=1)
    ax.invert_yaxis()
    # ax.set_xticks(np.arange(-.5, 10, 1)); ax.set_yticks(np.arange(0, 101, 8000))
    plt.plot([left_edge[0], top_edge[0]], [left_edge[1], top_edge[1]], "green")
    plt.show()
    x = 1
def compute_pmatrix(X):
    """Computes the p-matrix of input binary matrix `X` using hypergeometric pmf.

    For every pair of rows (i, j) the entry is the probability of sharing at
    least as many columns as observed: the sum of ``hypergeom.pmf(x, n, a, b)``
    for x from the observed overlap c up to min(a, b), where a and b are the
    row sums and n the number of columns.  Pairs in which one row is empty
    get 1.  The result is symmetric with a zero diagonal.

    See Lima-Mendez 2008.

    NOTE: Much slower than MATLAB. Investigate"""
    (N, n) = X.shape
    P = np.zeros((N, N))
    # Row sums do not depend on the pairing, so compute each of them once.
    row_sums = [sum(X[i, :]) for i in range(N)]
    for i in range(N):
        # Parenthesised form prints identically on Python 2 and 3.
        print("%d / %d" % (i + 1, N))
        a = row_sums[i]
        for j in range(i + 1, N):
            b = row_sums[j]
            c = np.dot(X[i, :], X[j, :])
            C = min(a, b)
            # Value comparison: `C is not 0` tests object identity, which is
            # true for every numpy integer, so the guard never fired.
            if C != 0:
                P[i, j] = sum(hypergeom.pmf(range(c, C + 1), n, a, b))
            else:
                # an empty row cannot overlap with anything
                P[i, j] = 1
    P = P + P.T
    return P
def compute_pval(ra, rb):
    """
    Compute the pval between two binary gene profiles.

    pval is defined using hypergeometric formula: the probability of sharing
    at least as many positions as observed, i.e. the sum of
    ``hypergeom.pmf(x, n, a, b)`` for x from the observed overlap up to
    min(a, b), where a and b are the profile sums and n the profile length.
    Returns 1 when either profile is empty.

    Input:
        ra: series
        rb: series
    """
    from scipy.stats import hypergeom

    n = len(ra)
    a = sum(ra)
    b = sum(rb)
    c = np.dot(ra, rb)
    C = min(a, b)
    # Value comparison: `C is not 0` tests object identity and is true for
    # every numpy integer, so the guard never fired for array input.
    if C != 0:
        pval = sum(hypergeom.pmf(range(c, C + 1), n, a, b))
    else:
        pval = 1
    return pval
def plot_cell_enrichments(
    ds,
    f=None,
    enrichments=None,
    ax=None,
    title=None,
):
    """Plot hypergeometric enrichment p-values per cell type as a bar chart.

    For every cell type in ``brs.CELL_TYPES`` and every filter in ``f``, the
    data set restricted to that cell type's proteins is the foreground and
    the data set restricted to proteins of any cell type is the background.
    The p-value is the probability of at least ``fore_hits`` hits among
    ``fore_size`` draws from the background.

    Parameters: ``ds`` is the data set; ``f`` is a filter dict or a list of
    them (defaults to ``{'p': .05, 'asym_fold': 1.25}``); ``enrichments``
    maps protein to cell type (looked up from the first species of ``ds``
    when None); ``ax`` is an existing axes to draw on; ``title`` is an
    optional axes title.

    Returns the tuple ``(figure, ax)``.
    """
    LOGGER.info("Plotting cell type enrichments")

    if enrichments is None:
        enrichments = brs.enrichments.get_enrichments(
            list(ds.species)[0],
        )

    if f is None:
        f = {'p': .05, 'asym_fold': 1.25}

    # A single filter is treated as a one-element list of filters.
    if isinstance(f, dict):
        f = [f]

    if ax is None:
        _, ax = plt.subplots(figsize=(4, 3))

    # Proteins grouped by the cell type they are assigned to.
    cell_prots = {
        cell: [key for key, val in enrichments.items() if val == cell]
        for cell in brs.CELL_TYPES
    }

    # Shorten the oligodendrocyte label only if some 'Oligo' cell type
    # actually has proteins.
    display_name = {
        'Myelinating Oligodendrocytes': 'Oligodendrocytes',
    } if sum([
        'Oligo' in cell and len(cell_prots.get(cell, [])) > 0
        for cell in brs.CELL_TYPES
    ]) else {}

    vals = []

    # Background: rows for proteins of any cell type, keeping only rows whose
    # 'Proteins' entry has fewer than two elements.
    ds = ds.filter(
        protein=set(j for i in cell_prots.values() for j in i),
        fn=lambda x: len(x['Proteins']) < 2,
    )

    hatches = [
        "",
        "//",
        "o",
        "x",
        ".",
    ]

    for cell in brs.CELL_TYPES:
        for ind, fil in enumerate(f):
            dc = ds.filter(protein=set(cell_prots[cell]))
            fore_hits = dc.filter(fil).shape[0]
            fore_size = dc.shape[0]
            back_hits = ds.filter(fil).shape[0]
            back_size = ds.shape[0]

            # Nothing to test without a foreground or a background.
            if fore_size < 1 or back_size < 1:
                continue

            # Upper tail P(X >= fore_hits): 1 - P(X <= k) + P(X = k).
            val = (
                1 -
                hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) +
                hypergeom.pmf(fore_hits, back_size, back_hits, fore_size)
            )
            vals.append(
                pd.Series(
                    OrderedDict([
                        ('cell', display_name.get(cell, cell)),
                        ('fore hits', fore_hits),
                        ('fore size', fore_size),
                        ('back hits', back_hits),
                        ('back size', back_size),
                        ('p-value', val),
                        ('-log10 p-value', -np.log10(val)),
                        ('color', brs.CELL_COLORS[cell]),
                        ('hatch', hatches[ind % len(hatches)]),
                        ('hue', format_title(f=fil)),
                    ])
                )
            )

    df = pd.DataFrame(vals)

    ax = sns.barplot(
        data=df,
        y='cell',
        x='-log10 p-value',
        hue='hue',
        ax=ax,
    )
    # Dotted reference line at p = 0.01.
    ax.axvline(-np.log10(.01), color='k', linestyle=':')
    # One legend entry per filter, identified by its hatch pattern.
    ax.legend(
        handles=[
            mpatches.Patch(
                facecolor='w',
                edgecolor='k',
                hatch=i,
                label=df['hue'].iloc[ind],
            )
            for ind, i in enumerate(hatches[:len(f)])
        ]
    )

    # Restyle the bars, pairing them in order of vertical position with the
    # rows of df.
    # NOTE(review): assumes that order matches the row order of df -- confirm.
    for hatch, color, p in zip(
        df['hatch'],
        df['color'],
        sorted(ax.patches, key=lambda x: x.xy[1]),
    ):
        p.set_hatch(hatch)
        p.set_facecolor(color)
        p.set_edgecolor('k')

    if title:
        ax.set_title(title)

    ax.set_ylabel('')
    ax.set_xlabel('p-value')
    # Tick positions are -log10(p); label them with the p-values themselves.
    ax.set_xticklabels(['{:.3}'.format(10 ** -i) for i in ax.get_xticks()])

    return ax.get_figure(), ax
def calculate_downsampled_estimate(i, n, m):
    """Return the interior of the hypergeometric pmf for a downsampled count.

    The pmf of drawing j successes in ``m`` draws from a population of ``n``
    containing ``i`` successes is written into a length ``m + 1`` vector; the
    first and last entries (j = 0 and j = m) are dropped from the result.
    """
    lowest = max([0, m - (n - i)])
    highest = min([i, m])
    support = numpy.arange(lowest, highest + 1)

    spectrum = numpy.zeros(m + 1)
    spectrum[support] += hypergeom.pmf(support, n, i, m)
    return spectrum[1:-1]
write.writelines('#Human phenotype: ' + humanPhenotypeName) write.writelines('\n#\tAnalysis run ' + str(today.year) + '-' + str(today.month) + '-' + str(today.day)) write.writelines('\n#\t' + str(len(set(humanMicrocephalyGenes))) + ' genes of ' + str(len(humanToHom)) + ' give rise to this phenotype') write.writelines('\n#Zebrafish phenotype: ' + zfishAnatomy[0]+ ' ' + zfishPhenotype[0]) p=1 while p<len(zfishAnatomy): write.writelines(',' + zfishAnatomy[p]+ ' ' + zfishPhenotype[p]) p+=1 write.writelines('\n#\t' + str(numFishMicrocephaly) + ' of ' + str(numFishGenes) + ' fish genes match phenotype (' + str(100.*numFishMicrocephaly/numFishGenes) + '%)') write.writelines('\n#\t' + str(fishHasMicrocephaly) + ' of ' + str(geneInFish) + ' fish with human genes match phenotype (' + str(100.*fishHasMicrocephaly/geneInFish) + '%)') fishPval=hypergeom.pmf(fishHasMicrocephaly,numFishGenes,numFishMicrocephaly,geneInFish) write.writelines('\n#\tp-value = ' + str(fishPval) + ' by hypergeometic test') write.writelines('\n#Mouse phenotype: ' + mousePhenotypesReference[0]) for pheno in mousePhenotypesReference[1:]: write.writelines(',' + pheno) write.writelines('\n#\t' + str(numMouseMicrocephaly) + ' of ' + str(numMouseGenes) + ' mice genes match phenotype (' + str(100.*numMouseMicrocephaly/numMouseGenes) + '%)') write.writelines('\n#\t' + str(mouseHasMicrocephaly) + ' of ' + str(geneInMouse) + ' mice with human genes match phenotype (' + str(100.*mouseHasMicrocephaly/geneInMouse) + '%)') mousePval=hypergeom.pmf(mouseHasMicrocephaly,numMouseGenes,numMouseMicrocephaly,geneInMouse) write.writelines('\n#\tp-value = ' + str(mousePval) + ' by hypergeometic test') write.writelines(toWrite) write.close() if fishHasMicrocephaly>0:
def fisher_exact(c):
    """Performs a Fisher exact test on a 2x2 contingency table.

    Parameters
    ----------
    c : array_like of ints
        A 2x2 contingency table.

    Returns
    -------
    oddsratio : float
        This is prior odds ratio and not a posterior estimate.
    p-value : float
        P-value for 2-sided hypothesis of independence.

    Examples
    --------
    >>> fisher_exact([[100, 2], [1000, 5]])
    (0.25, 0.13007593634330314)
    """
    c = np.asarray(c, dtype=np.int64)  # int32 is not enough for the algorithm
    if c[1, 0] > 0 and c[0, 1] > 0:
        odssratio = c[0, 0] * c[1, 1] / float(c[1, 0] * c[0, 1])
    else:
        odssratio = np.inf

    n1 = c[0, 0] + c[0, 1]
    n2 = c[1, 0] + c[1, 1]
    n = c[0, 0] + c[1, 0]
    total = n1 + n2

    def pmf(k):
        # probability that the top-left cell equals k given the margins
        return hypergeom.pmf(k, total, n1, n)

    mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
    pexact = pmf(c[0, 0])
    pmode = pmf(mode)

    epsilon = 1 - 1e-4
    # Builtin max of the two scalars; np.max(pexact, pmode) would have
    # treated pmode as the `axis` argument.
    if float(np.abs(pexact - pmode)) / np.abs(max(pexact, pmode)) <= 1 - epsilon:
        # The observed table is (numerically) the most likely one.
        return odssratio, 1

    elif c[0, 0] < mode:
        plower = hypergeom.cdf(c[0, 0], total, n1, n)
        if pmf(n) > pexact / epsilon:
            # Nothing on the upper side is as unlikely as the observation.
            return odssratio, plower

        # Binary search for where to begin upper half.
        lo = mode
        hi = n
        guess = -1
        while hi - lo > 1:
            # Floor division keeps the guess an integer on Python 3 as well.
            guess = hi if hi == lo + 1 and guess == lo else (hi + lo) // 2
            pguess = pmf(guess)
            if pguess <= pexact and pmf(guess - 1) > pexact:
                break
            elif pguess < pexact:
                hi = guess
            else:
                lo = guess
        if guess == -1:
            guess = lo

        # Linear clean-up so that guess is the first upper-tail point that is
        # at most as likely as the observation (within epsilon).
        while guess > 0 and pmf(guess) < pexact * epsilon:
            guess -= 1
        while pmf(guess) > pexact / epsilon:
            guess += 1

        p = plower + hypergeom.sf(guess - 1, total, n1, n)
        if p > 1.0:
            p = 1.0
        return odssratio, p

    else:
        pupper = hypergeom.sf(c[0, 0] - 1, total, n1, n)
        if pmf(0) > pexact / epsilon:
            # Nothing on the lower side is as unlikely as the observation.
            return odssratio, pupper

        # Binary search for where to begin lower half.
        lo = 0
        hi = mode
        guess = -1
        while hi - lo > 1:
            guess = hi if hi == lo + 1 and guess == lo else (hi + lo) // 2
            pguess = pmf(guess)
            if pguess <= pexact and pmf(guess + 1) > pexact:
                break
            elif pguess <= pexact:
                lo = guess
            else:
                hi = guess
        if guess == -1:
            guess = lo

        # Linear clean-up so that guess is the last lower-tail point that is
        # at most as likely as the observation (within epsilon).
        while pmf(guess) < pexact * epsilon:
            guess += 1
        while guess > 0 and pmf(guess) > pexact / epsilon:
            guess -= 1

        p = pupper + hypergeom.cdf(guess, total, n1, n)
        if p > 1.0:
            p = 1.0
        return odssratio, p
def _motif_sig(fore_hits, fore_size, back_hits, back_size): return ( 1 - hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) + hypergeom.pmf(fore_hits, back_size, back_hits, fore_size) )
def validate_over_under_represented_fast(g, edgeAttr, sig, correct, isDirected):
    """Return the edges whose weight is over- and under-represented.

    Under a hypergeometric null with population W (total edge weight) and
    the weighted degrees sout/sin of the two endpoints, the over p-value of
    an edge sums the pmf from its weight up to ``min(sout, sin)`` and the
    under p-value sums it from 0 up to its weight.

    Parameters:
        g: graph exposing ``edges(data=True)``, ``number_of_edges()``,
            ``in_degree``/``out_degree`` (and ``degree`` when not directed)
            and ``adjacency_iter()`` or ``adjacency()``.
        edgeAttr: name of the edge attribute holding the weight.
        sig: significance level.
        correct: when True, divide ``sig`` by T = e + nB*nL - nLB
            (Bonferroni), where e is the number of edges, nB/nL the number
            of nodes with incoming/outgoing edges and nLB those with both.
        isDirected: use out-degree of the source and in-degree of the target
            instead of plain degrees.

    Returns:
        ``(over_keys, under_keys)``: the ``(source, target)`` keys with an
        over / under p-value below the threshold.
    """
    sumWeights = int(np.sum([e[2][edgeAttr] for e in g.edges(data=True)]))

    e = g.number_of_edges()
    nB = len([u for u in g if g.in_degree(u) > 0])
    nL = len([u for u in g if g.out_degree(u) > 0])
    nLB = len([u for u in g if g.out_degree(u) > 0 and g.in_degree(u) > 0])
    T = e + nB * nL - nLB
    # Single-argument print() calls behave the same on Python 2 and 3.
    print("e: " + str(e))
    print("nB: " + str(nB))
    print("nL: " + str(nL))
    print("nLB: " + str(nLB))
    print("T: " + str(T))

    if correct == True and T:
        multivariateSignificanceCorrection = sig / float(T)  # Bonferroni
    else:
        # Also reached when T == 0, where there is nothing to correct.
        multivariateSignificanceCorrection = sig

    # networkx 1.x offers adjacency_iter(), later releases adjacency().
    if hasattr(g, "adjacency_iter"):
        adjacency = g.adjacency_iter()
    else:
        adjacency = g.adjacency()

    # A preliminary pass that stored per-edge pmf values in a dict nobody
    # read has been dropped.
    pvalOver = {}
    pvalUnder = {}
    for source, nbrsdict in adjacency:
        for target, keydict in nbrsdict.items():
            for key, eattr in keydict.items():
                if key != edgeAttr:
                    continue
                if isDirected:
                    sout = g.out_degree(source, weight=edgeAttr)
                    sin = g.in_degree(target, weight=edgeAttr)
                else:
                    sout = g.degree(source, weight=edgeAttr)
                    sin = g.degree(target, weight=edgeAttr)

                # do the over validation
                upperSumLim = int(min(sout, sin))
                pvalOver[(source, target)] = sum(
                    hypergeom.pmf(X, sumWeights, sout, sin, loc=0)
                    for X in range(int(eattr), upperSumLim + 1)
                )
                # do the under validation
                pvalUnder[(source, target)] = sum(
                    hypergeom.pmf(X, sumWeights, sout, sin, loc=0)
                    for X in range(0, int(eattr) + 1)
                )

    # now apply the statistical correction; the null hypothesis is rejected
    # for every edge below the threshold
    validatedOver = {}
    validatedUnder = {}
    for edge in pvalOver:
        if pvalOver[edge] < multivariateSignificanceCorrection:
            validatedOver[edge] = pvalOver[edge]
        if pvalUnder[edge] < multivariateSignificanceCorrection:
            validatedUnder[edge] = pvalUnder[edge]
    return (validatedOver.keys(), validatedUnder.keys())
def validate_over_represented_fast(g, edgeAttr, sig, correct, isDirected):
    """Return the edges whose weight is significantly over-represented.

    Every edge carrying ``edgeAttr`` is tested against a hypergeometric null
    model evaluated as ``hypergeom.pmf(X, sumWeights, sout, sin)``, where
    ``sumWeights`` is the integer total of ``edgeAttr`` over all edges,
    ``sout`` is the weighted degree of the edge's source and ``sin`` the
    weighted degree of its target. The p-value is the pmf summed from the
    observed weight up to ``min(sout, sin)``.

    Arguments:
        g: graph exposing the networkx-style calls used below:
            ``edges(data=True)``, ``number_of_edges()``, ``adj`` and
            ``in_degree``/``out_degree`` (directed) or ``degree``.
        edgeAttr: name of the edge attribute holding the weight.
        sig: significance level.
        correct: when true, divide ``sig`` by the number of edges
            (Bonferroni correction).
        isDirected: when true use out-degree of the source and in-degree of
            the target; otherwise use ``degree`` for both endpoints.

    Returns:
        List of ``(source, target)`` pairs whose p-value is strictly below
        the (possibly corrected) significance level.
    """
    sumWeights = int(np.sum([data[edgeAttr] for _, _, data in g.edges(data=True)]))

    # An edgeless graph has nothing to test; skip the division there
    # instead of raising ZeroDivisionError.
    numTests = g.number_of_edges()
    if correct and numTests:
        threshold = sig / float(numTests)  # Bonferroni
    else:
        threshold = sig

    validated = []
    # g.adj.items() yields the same (node, neighbours) pairs as the legacy
    # adjacency_iter() and is available across networkx versions.
    for source, nbrsdict in g.adj.items():
        # The source-side degree does not depend on the neighbour.
        if isDirected:
            sout = g.out_degree(source, weight=edgeAttr)
        else:
            sout = g.degree(source, weight=edgeAttr)

        for target, eattrs in nbrsdict.items():
            if edgeAttr not in eattrs:
                continue
            if isDirected:
                sin = g.in_degree(target, weight=edgeAttr)
            else:
                sin = g.degree(target, weight=edgeAttr)

            lowerSumLim = int(eattrs[edgeAttr])
            upperSumLim = int(min(sout, sin))
            if upperSumLim < lowerSumLim:
                pval = 0
            else:
                support = np.arange(lowerSumLim, upperSumLim + 1)
                pval = hypergeom.pmf(support, sumWeights, sout, sin).sum()

            # Reject the null hypothesis when strictly below the threshold.
            if pval < threshold:
                validated.append((source, target))

    return validated
def calc_hypergeometric(pathway_file, stats_file, go_list, threshold_go_list,
                        pathway_ids_list):
    """
    Calculates hypergeometric tail p-values for each requested pathway.

    Symbols (also the column names of the returned DataFrame):

    N = number of GOs in study
    n = number of GOs in given pathway
    m = number of disease-associated GOs
    k = number of disease-associated GOs in given pathway

    For every pathway id the function reports
    ``p_upper = P(X >= k)``, ``p_lower = P(X <= k)`` and
    ``pvalue = min(p_upper, p_lower)`` for the hypergeometric distribution
    evaluated as ``hypergeom(N, m, n)``.

    Arguments:
        pathway_file: comma-delimited file loaded through
            ``csv_io.csv_to_data_frame``; its columns must include every id
            in ``pathway_ids_list``.
        stats_file: file loaded through ``csv_io.csv_to_data_frame``.
        go_list: row labels of the GOs in the study.
        threshold_go_list: row labels of the disease-associated GOs.
        pathway_ids_list: pathway column names to evaluate.

    Returns:
        ``(pvalues_df, err)``. On success ``pvalues_df`` is a DataFrame
        indexed by ``pathway_ids_list`` and ``err`` is ``''``. If some
        pathway ids are missing from ``pathway_file``, ``pvalues_df`` is
        ``''`` and ``err`` holds the error message.

    Note:
        Rows are selected with ``.loc``, so every label in ``go_list`` and
        ``threshold_go_list`` must exist in the corresponding DataFrame.
    """
    ## Initialize an empty pvalues DataFrame (really just an empty string) to
    ## return when there are errors
    pvalues_df = ''
    err = ''

    ## Convert TAB/CSV files into DataFrames
    pathway_df = csv_io.csv_to_data_frame(pathway_file, delimiter=',')
    stats_df = csv_io.csv_to_data_frame(stats_file)

    ## Check that user supplied pathway_ids exist in the pathway_df
    known_ids = set(pathway_df.columns.tolist())
    diff = [x for x in pathway_ids_list if x not in known_ids]
    if diff:
        err = "ERROR: %s does not contain pathway_ids: %s" % (
            pathway_file, diff)
        return pvalues_df, err

    ## .loc needs a list-like indexer; this also accepts sets and tuples
    go_labels = list(go_list)
    threshold_labels = list(threshold_go_list)

    ## Number of GOs in study
    N = len(go_list)

    ## Number of disease-associated GOs (this is a sub-set DataFrame of only
    ## those values in the stats_file whose chosen statistic was above the
    ## threshold)
    m = len(stats_df.loc[threshold_labels])

    ## The row subsets do not depend on the pathway: slice them once
    threshold_rows = pathway_df.loc[threshold_labels]
    study_rows = pathway_df.loc[go_labels]

    hyper_dict = {'N': [], 'n': [], 'm': [], 'k': [],
                  'p_upper': [], 'p_lower': [], 'pvalue': []}

    ## Loop over pathway ids in the DataFrame
    for pw_id in pathway_ids_list:
        ## Number of disease-associated GOs in pathway pw_id
        k = int(threshold_rows[pw_id].sum())

        ## Number of GOs in pathway pw_id
        n = int(study_rows[pw_id].sum())

        ## Tail probabilities taken directly from the distribution:
        ## sf(k - 1) = P(X >= k) and cdf(k) = P(X <= k). This avoids the
        ## cancellation in 1 - p_upper + pmf(k).
        p_upper = float(hypergeom.sf(k - 1, N, m, n))
        p_lower = float(hypergeom.cdf(k, N, m, n))
        pvalue = min(p_upper, p_lower)

        ## Save p-values to dictionary
        hyper_dict['N'].append(N)
        hyper_dict['n'].append(n)
        hyper_dict['m'].append(m)
        hyper_dict['k'].append(k)
        hyper_dict['p_upper'].append(p_upper)
        hyper_dict['p_lower'].append(p_lower)
        hyper_dict['pvalue'].append(pvalue)

    ## Format dictionary as a DataFrame
    pvalues_df = pd.DataFrame(hyper_dict, index=pathway_ids_list)

    return pvalues_df, err
import sa
import statistics
import random_assembly
import networkx as nx
from scipy import stats
from scipy.stats import hypergeom
import sys

# Script: for each pair of consecutive assemblies loaded from the directory
# given as the first command-line argument, count how many neighbours every
# piece keeps, then print the observed distribution of those counts next to
# the hypergeometric expectation.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s ASSEMBLY_DIR" % sys.argv[0])

    # Used as both the number of marked items and the number of draws in the
    # expected distribution below, and as the largest overlap reported.
    curr_size = 6
    assemblies = sa.load_all(dir=sys.argv[1])

    intersect_num = []
    for idx in range(len(assemblies) - 1):
        following_pieces = assemblies[idx + 1].pieces
        for piece_id, piece in assemblies[idx].pieces.items():
            try:
                following = following_pieces[piece_id]
            except KeyError:
                # Piece is absent from the next assembly: nothing to compare.
                continue
            intersect_num.append(
                len(piece.neighbourhood.intersection(following.neighbourhood)))

    total = len(intersect_num)
    for shared in range(curr_size + 1):
        found = intersect_num.count(shared)
        # With no comparable pieces report 0% instead of dividing by zero.
        percent = round(100 * found / total, 2) if total else 0.0
        print('Found', shared, 'shared: ', found, percent,
              '%; expected: ',
              round(100 * hypergeom.pmf(shared, 100, curr_size, curr_size), 2),
              '%')
def phyper(k, K, n, N):
    """Return the hypergeometric probability mass at ``k``.

    Thin wrapper that reorders its arguments for
    ``scipy.stats.hypergeom.pmf``: ``N`` is passed as the first shape
    parameter, ``n`` as the second and ``K`` as the third. Note that the
    value returned is the probability mass, not a cumulative probability.
    """
    first_shape, second_shape, third_shape = N, n, K
    mass = hypergeom.pmf(k, first_shape, second_shape, third_shape)
    return mass