Exemple #1
0
def fisherexacttwotails(table):
    """
    Calculate the two-sided Fisher exact test p-value for a 2x2 table,
    i.e. H0: p1 = p2 against Ha: p1 != p2.

    The p-value is the total hypergeometric probability of every table with
    the observed margins that is no more likely than the observed table.

    Arguments:
        table {list} -- 2x2 contingency table given as two rows of two
            counts each, e.g. ``[[a, b], [c, d]]``.

    Returns:
        float -- the two-sided p-value.
    """

    table_array = np.array(table)
    row0 = np.sum(table_array[0, :])
    row1 = np.sum(table_array[1, :])
    col0 = np.sum(table_array[:, 0])

    # Cell [0, 0] is bounded by its own row total and its own column total,
    # so the enumeration must stop at min(row0, col0).
    max_X = np.min([row0, col0])
    N = row0 + row1
    m = col0
    k = row0
    X = np.arange(max_X + 1)
    possible_table_probab = hypergeom.pmf(X, N, m, k)
    observed_probab = hypergeom.pmf(table_array[0, 0], N, m, k)

    # Tables that are mathematically as likely as the observed one can differ
    # from it by a rounding error; a small relative slack keeps them in.
    more_extreme = possible_table_probab <= observed_probab * (1 + 1e-7)
    return np.sum(possible_table_probab * more_extreme)


#===== end file =====
    def pvalue(a_true, a_false, b_true, b_false):
        """Return left-, right- and two-tailed hypergeometric p-values.

        The four arguments are the cells of a 2x2 table for groups ``a`` and
        ``b``. Group ``a`` is treated as the study sample and ``a`` + ``b``
        together as the population it was drawn from. The result is a
        ``PValues`` triple (left, right, two-tailed), each capped at 1.0.
        """
        # Re-express the a/b counts as study-vs-population quantities.
        study_hits = a_true
        study_size = a_true + a_false
        pop_hits = a_true + b_true
        pop_size = pop_hits + a_false + b_false

        # Smallest and largest hit counts that are possible with these margins.
        lowest = max(0, study_size - (pop_size - pop_hits))
        highest = min(study_size, pop_hits)
        if lowest == highest:
            # Only one outcome is possible, so nothing can be more extreme.
            return PValues(1.0, 1.0, 1.0)

        slack = 1e-6
        observed_mass = hypergeom.pmf(study_hits, pop_size, pop_hits,
                                      study_size)
        left = right = both = 0
        for count in range(lowest, highest + 1):
            mass = hypergeom.pmf(count, pop_size, pop_hits, study_size)
            if count <= study_hits:
                left += mass
            if count >= study_hits:
                right += mass
            # Two-tailed: every outcome at most as likely as the observed one
            # (with an absolute slack on the comparison).
            if mass <= observed_mass + slack:
                both += mass

        return PValues(min(left, 1.0), min(right, 1.0), min(both, 1.0))
Exemple #3
0
def two_tailed_fisher_exact(table):
    '''Two-sided Fisher exact p-value for a 2x2 contingency table.

    Input Parameter:
            table : list
                Contains two elements, each a two-element list giving one
                row of the contingency table.

    Return:
        The summed probability of every table (same margins) whose
        probability does not exceed that of the observed table.
    '''
    counts = np.array(table)

    #- Margins needed to parameterise the hypergeometric distribution.
    first_row_total = np.sum(counts[0, :])
    grand_total = first_row_total + np.sum(counts[1, :])
    first_col_total = np.sum(counts[:, 0])

    #- Every value the top-left cell is enumerated over.
    candidates = np.arange(np.min([first_row_total, first_col_total]) + 1)
    probs = hypergeom.pmf(candidates, grand_total, first_col_total,
                          first_row_total)
    observed = hypergeom.pmf(counts[0, 0], grand_total, first_col_total,
                             first_row_total)

    #- Keep only the tables at most as probable as the observed one.
    at_most_observed = probs <= observed
    return np.sum(probs * at_most_observed)
def validate_over_represented(g, edgeAttr, sig, correcting, isDirected):
    """Return the edges whose weight is over-represented under a hypergeometric null.

    For each edge the p-value is the sum of ``hypergeom.pmf(X, W, sout, sin)``
    for ``X`` from the edge weight up to ``min(sout, sin)``, where ``W`` is the
    total edge weight of the graph, ``sout`` the weighted (out-)degree of the
    source and ``sin`` the weighted (in-)degree of the target.

    Parameters:
        g          -- graph exposing ``edges(data=True)`` and weighted
                      ``degree`` / ``out_degree`` / ``in_degree``.
        edgeAttr   -- name of the edge attribute holding the weight.
        sig        -- significance level.
        correcting -- if truthy, apply a Bonferroni correction (``sig`` divided
                      by the number of edges).
        isDirected -- use out-/in-degree instead of plain degree.

    Returns:
        The keys ``(source, target)`` of every edge with p-value below the
        (possibly corrected) significance level.
    """
    # edges(data=True) is available on every NetworkX version, unlike the
    # edges_iter() method that was removed in NetworkX 2.0.
    edges = list(g.edges(data=True))
    total_weight = int(np.sum([data[edgeAttr] for _, _, data in edges]))

    threshold = sig
    if correcting and edges:
        threshold = sig / float(len(edges))  # Bonferroni

    pval = {}
    for source, target, data in edges:
        if isDirected:
            sout = g.out_degree(source, weight=edgeAttr)
            sin = g.in_degree(target, weight=edgeAttr)
        else:
            sout = g.degree(source, weight=edgeAttr)
            sin = g.degree(target, weight=edgeAttr)

        # Upper tail from the observed weight to the largest value the two
        # degrees allow.
        lower = int(data[edgeAttr])
        upper = int(min(sout, sin))
        tail = np.arange(lower, upper + 1)
        pval[(source, target)] = float(
            np.sum(hypergeom.pmf(tail, total_weight, sout, sin, loc=0)))

    validated = {edge: p for edge, p in pval.items() if p < threshold}
    return validated.keys()
def accumulative_hypergeometric(k, n, K, N):
    '''
    Upper-tail hypergeometric probability P(X >= k).

    [k]: SUCCESS IN THE CLUSTER
    [n]: SIZE OF THE CLUSTER
    [K]: SUCCESS IN POPULATION
    [N]: SIZE OF THE POPULATION

    All four arguments are truncated to int before use.
    '''
    k, n, K, N = int(k), int(n), int(K), int(N)
    # P(X >= k) equals P(X > k - 1) for integer k, so one survival-function
    # call replaces both the sf + pmf and the 1 - cdf + pmf formulations.
    return hyp.sf(k - 1, N, K, n)
Exemple #6
0
    def compute_risk(self,
                     sub_audit: PairwiseAudit,
                     votes_for_winner: int = None,
                     current_round: int = None,
                     *args,
                     **kwargs) -> float:
        """Compute the risk level for a subaudit given the sample drawn so far.

        For every possible true winner tally (0 .. contest_ballots) the
        hypergeometric probability of seeing ``votes_for_winner`` in a sample
        of ``current_round`` ballots is weighted by ``sub_audit.prior``. The
        weighted values are normalised when their sum is positive, and the
        risk is the mass on tallies up to and including half of the contest
        ballots.

        Args:
            sub_audit (PairwiseAudit): Subaudit supplying the prior and the
                contest size.
            votes_for_winner (int): Votes found for the reported winner in the
                current sample.
            current_round (int): Current round (sample) size.

        Returns:
            float: Value for risk of given sample and round size.
        """
        total_ballots = sub_audit.sub_contest.contest_ballots
        candidate_tallies = np.arange(total_ballots + 1)

        likelihood = np.array(
            hg.pmf(votes_for_winner, total_ballots, candidate_tallies,
                   current_round))
        weighted = sub_audit.prior * likelihood

        total_mass = sum(weighted)
        if total_mass > 0:
            weighted = weighted / total_mass

        # Tallies 0 .. floor(total / 2) form the lower half of the distribution.
        lower_half = range(math.floor(total_ballots / 2) + 1)
        return sum(weighted[lower_half])
Exemple #7
0
def calc_hg_enrichment_pval(mat, a, arm_a, aneu_type_a, b, arm_b, aneu_type_b):
    """Hypergeometric upper-tail p-value for the co-occurrence of two events.

    Event A is ``mat["<a><arm_a>"] == aneu_type_a`` and event B is
    ``mat["<b><arm_b>"] == aneu_type_b``, evaluated per row of ``mat``.
    The returned value is sf(overlap) + pmf(overlap) with the number of rows
    as population, the rows with A as successes and the rows with B as draws.
    """
    col_a = mat.loc[:, "{}{}".format(a, arm_a)].values
    col_b = mat.loc[:, "{}{}".format(b, arm_b)].values

    a_hit, a_miss = col_a == aneu_type_a, col_a != aneu_type_a
    b_hit, b_miss = col_b == aneu_type_b, col_b != aneu_type_b

    n_overlap = np.sum(np.logical_and(a_hit, b_hit))
    n_a = np.sum(np.logical_and(a_hit, b_miss))  # A without B
    n_b = np.sum(np.logical_and(a_miss, b_hit))  # B without A

    total_rows = mat.shape[0]
    rows_with_a = n_overlap + n_a
    rows_with_b = n_overlap + n_b

    strictly_more = hypergeom.sf(n_overlap, total_rows, rows_with_a,
                                 rows_with_b)
    exactly = hypergeom.pmf(n_overlap, total_rows, rows_with_a, rows_with_b)
    pval = strictly_more + exactly
    return pval
def beta(u_i,i,n):
    # beta(i,u_i) = P(U_i = u_i): average of the hypergeometric pmf over
    # every j in 0..n except j == int(n/2).
    half = int(n/2)
    candidates = chain(range(half), range(half+1, n+1))
    total = sum(hypergeom.pmf(u_i, n, j, i) for j in candidates)
    return total/n
 def plot(self, x, n, p):
     """Plot a probability mass function over ``x`` as a connected marker line.

     ``n`` and ``p`` are forwarded to ``hypergeom.pmf`` and shown in the title.
     """
     # NOTE(review): scipy's hypergeom.pmf expects three shape parameters
     # (M, n, N) after the evaluation points, but only two are supplied here,
     # so this call raises TypeError as written. The (n, p) signature looks
     # adapted from a binomial plot -- confirm the intended parameters.
     pmf = hypergeom.pmf(x, n, p)
     plt.plot(x, pmf, 'o-')
     # The placeholder fontsize='value' is not a valid matplotlib size and has
     # been dropped, so the default font size is used.
     plt.title('HyperGeometric: n=%i , p=%.2f' % (n, p))
     plt.xlabel('Number of successes')
     plt.ylabel('Probability of Successes')
     plt.show()
def test_alg1_alg2_accuracy_difference():
    """Check that XL-mHG p-value algorithm 2 stays accurate where algorithm 1 yields NaN.

    For a = 20 ones followed by b zeros (b = 50..99), the reference value is
    hypergeom.pmf(a, a + b, a, a); pval2 must match it within ``tol`` and
    pval1 must contain at least one NaN.
    """
    a = 20
    bvals = np.arange(50, 100, dtype=np.int64)
    tol = 1e-12

    count = bvals.size
    truth = np.zeros(count, dtype=np.float64)
    pval1 = np.zeros(count, dtype=np.float64)
    pval2 = np.zeros(count, dtype=np.float64)

    for idx, b in enumerate(bvals):
        b = int(b)
        N = a + b
        K = a
        X, L = 1, N
        truth[idx] = hypergeom.pmf(K, N, K, a)

        # Ranked list with all the ones at the top.
        v = np.r_[np.ones(a, dtype=np.uint8), np.zeros(b, dtype=np.uint8)]
        stat, n_star = mhg.get_xlmhg_stat(v, X, L)
        pval1[idx] = mhg.get_xlmhg_pval1(N, K, X, L, stat)
        pval2[idx] = mhg.get_xlmhg_pval2(N, K, X, L, stat)

    assert not np.any(np.isnan(pval2))
    assert np.any(np.isnan(pval1))
    for idx in range(count):
        assert mhg.is_equal(pval2[idx], truth[idx], tol=tol)
Exemple #11
0
def test_nhypergeom_pmf():
    # Cross-check nhypergeom.pmf against the equivalent expression built
    # from hypergeom.pmf.
    M, n, r = 45, 13, 8
    k = 6
    draws = k + r - 1
    from_hypergeom = (hypergeom.pmf(k, M, n, draws)
                      * (M - n - (r - 1)) / (M - draws))
    from_nhypergeom = nhypergeom.pmf(k, M, n, r)
    assert_allclose(from_hypergeom, from_nhypergeom, rtol=1e-10)
def calcPValues(n, N1, params, P_actu):
    """Return one value per entry of ``P_actu``: sf + 0.5 * pmf.

    Entry ``i`` is evaluated at ``P_actu[i]`` for a hypergeometric
    distribution with population ``N1``, ``params[i]`` successes and ``n``
    draws.
    """
    pvalues = []
    for idx in range(len(P_actu)):
        observed = P_actu[idx]
        successes = params[idx]
        strictly_above = hypergeom.sf(observed, N1, successes, n)
        at_observed = hypergeom.pmf(observed, N1, successes, n)
        pvalues.append(strictly_above + 0.5 * at_observed)
    return pvalues
def probability_same(assignment):
    """Sum hypergeom.pmf(2, D, 2, size) over columns, minus the same over column-pair intersections.

    ``D`` is the number of rows of ``assignment``. ``size`` is the column sum
    for single columns and the length of ``intersection(...)`` for each pair
    of distinct columns; sizes of zero contribute nothing.
    """
    n_rows, n_cols = assignment.shape[0], assignment.shape[1]

    per_column = 0
    for col in range(n_cols):
        size = sum(assignment[:, col])
        if size > 0:
            per_column += hypergeom.pmf(2, n_rows, 2, size)

    per_pair = 0
    for first in range(n_cols):
        for second in range(first + 1, n_cols):
            size = len(intersection(assignment, first, assignment, second))
            if size > 0:
                per_pair += hypergeom.pmf(2, n_rows, 2, size)

    return per_column - per_pair
Exemple #14
0
 def cHgPvl(x,M,n,N):
     """
     Upper-tail hypergeometric p-value, P(X > x) + P(X = x).

     x=randVar
     M=popSize
     n=totalSuccesses
     N=samplSize
     """
     # The survival function is used instead of ``1 - cdf``: subtracting a
     # cdf that is close to 1 cancels away the small tail probabilities this
     # function exists to report.
     return hypergeom.sf(x,M,n,N)+hypergeom.pmf(x,M,n,N)
Exemple #15
0
def test_hypergeom():
    """
    Compare mine with scipy.stats.

    Computes the expected value of a hypergeometric distribution 100 times
    with scipy.stats.hypergeom.pmf and 100 times with
    hypergeom_pmf_iterator, then prints the mean absolute error against the
    closed-form expectation and the wall-clock time of each approach.
    :return:
    """
    from scipy.stats import hypergeom
    import time
    # M = population, N = number of trials, n = number of successes, k = point
    pop = 100
    trials = 50
    successes = 30
    failures = pop - successes
    kmin = max(0, trials - failures)
    kmax = min(trials, successes)
    expect = trials * float(successes) / pop
    iterations = 100

    ss_results = []
    ss_start_time = time.time()
    for _ in range(iterations):
        ss_results.append(
            sum(i * hypergeom.pmf(k=i, N=trials, M=pop, n=successes)
                for i in range(kmin, kmax + 1)))
    ss_time = time.time() - ss_start_time

    my_results = []
    my_start_time = time.time()
    for _ in range(iterations):
        # The iterator yields probabilities starting at k = kmin.
        my_results.append(
            sum((i + kmin) * pr for i, pr in enumerate(
                hypergeom_pmf_iterator(N=pop, k=successes, n=trials))))
    my_time = time.time() - my_start_time

    print("Average scipy error:")
    print(sum(abs(r - expect) for r in ss_results) / iterations)
    print("Average my implementation error:")
    print(sum(abs(r - expect) for r in my_results) / iterations)
    print("Scipy execution time:")
    print(ss_time)
    print("My implementation execution time:")
    print(my_time)
Exemple #16
0
    def current_dist_null(self):
        """Update distribution_null for current round.

        The new distribution covers the cumulative number of winner ballots
        after the latest round, under the null hypothesis given by
        ``self.null_margin``.
        """
        first_round = len(self.rounds) == 1
        if first_round:
            draw_size = self.rounds[0]
        else:
            draw_size = self.rounds[-1] - self.rounds[-2]

        # Number of winner votes under the null (a tie when null_margin is 0).
        total_ballots = self.contest.contest_ballots
        winner_exact = (total_ballots + self.null_margin) / 2
        winner_votes = math.floor(winner_exact)

        if self.replacement:
            # With replacement every round is an independent binomial draw;
            # later rounds are combined with the running distribution by
            # convolution.
            this_round = binom.pmf(range(0, draw_size + 1), draw_size,
                                   winner_exact / total_ballots)
            if first_round:
                self.distribution_null = this_round
            else:
                self.distribution_null = fftconvolve(self.distribution_null,
                                                     this_round)
            return

        outcomes = np.arange(draw_size + 1)
        if first_round:
            # Without replacement the first round is a plain hypergeometric.
            self.distribution_null = hypergeom.pmf(outcomes, total_ballots,
                                                   winner_votes, draw_size)
            return

        # Later rounds: condition on every relevant previous tally and draw
        # from the ballots that were still unsampled before this round.
        updated = [0 for _ in range(self.rounds[-1] + 1)]
        interval = self.__get_interval(self.distribution_null)
        remaining_ballots = total_ballots - self.rounds[-2]
        for prev_votes in range(interval[0], interval[1] + 1):
            prev_mass = self.distribution_null[prev_votes]
            remaining_winner = winner_votes - prev_votes
            conditional = hypergeom.pmf(outcomes, remaining_ballots,
                                        remaining_winner, draw_size)
            for new_votes in range(draw_size + 1):
                updated[prev_votes + new_votes] += prev_mass * conditional[new_votes]
        self.distribution_null = updated
Exemple #17
0
 def test_nch_hypergeom(self, dist_name):
     # With odds = 1 either noncentral hypergeometric distribution must
     # coincide with the ordinary hypergeometric distribution.
     noncentral = {
         'nchypergeom_fisher': nchypergeom_fisher,
         'nchypergeom_wallenius': nchypergeom_wallenius,
     }[dist_name]
     shape_args = (self.x, self.N, self.m1, self.n)
     noncentral_pmf = noncentral.pmf(*shape_args, odds=1)
     central_pmf = hypergeom.pmf(*shape_args)
     assert_allclose(noncentral_pmf, central_pmf)
def main():
    """Parse x, n, M, N from the command line and print an upper-tail sum.

    After ``verify`` has been called on the arguments, the value printed is
    the sum of ``hypergeom.pmf(i, N, M, n)`` for ``i`` from ``x`` to
    ``min(n, M)``.
    """
    args = parser.parse_args()
    x, n, M, N = args.x, args.n, args.M, args.N
    verify(x, n, M, N)
    total_probability = sum(
        hypergeom.pmf(i, N, M, n) for i in range(x, min(n, M) + 1))
    # print() call form so the module also parses under Python 3.
    print(total_probability)
 def hg(cardname, quantity, percent):
     """Return a hypergeometric pmf for ``cardname`` scaled by ``percent``.

     The pmf is evaluated at ``carddict[cardname]`` with a population of 60,
     ``quantity`` successes and ``sum(carddict.values())`` draws. A card that
     is missing from ``carddict`` yields 0.
     """
     try:
         observed = carddict[cardname]
     except KeyError:
         return 0
     deck_size = 60
     drawn = sum(carddict.values())
     probability = hypergeom.pmf(k=observed, M=deck_size, n=quantity, N=drawn)
     return probability * percent
Exemple #20
0
def fisher_exact_test_pval(a, x, n, N):
    """Return the smaller of the two one-sided hypergeometric tails at ``a``.

    The distribution has population ``N``, ``n`` successes and ``int(x)``
    draws. When ``int(x)`` is 0 or ``N`` the result is 1.
    """
    draws = int(x)
    if draws == 0 or draws == N:
        return 1

    masses = hypergeom.pmf(np.arange(N + 1), N, n, draws)
    lower = masses[:(a + 1)].sum()  # P(X <= a)
    upper = masses[a:].sum()        # P(X >= a)
    return np.minimum(lower, upper)
def epsilon(u_i,i,n):
    # epsilon(u_i,i,n) = P(X_i+1 = 1 | U_i=u_i)
    # Sum of gamma(k, ...) * pmf over every k in 0..n except k == int(n/2),
    # divided by n * beta(u_i, i, n).
    half = int(n/2)
    normaliser = beta(u_i,i,n)
    weighted = 0
    for k in chain(range(half), range(half+1, n+1)):
        weighted += gamma(k,u_i,i,n)*hypergeom.pmf(u_i,n,k,i)
    return weighted/(n*normaliser)
def functional_enrichment(dicbp2,
                          N,
                          geneset_toenrich,
                          type_correction,
                          pvalue_threshold=0.05):
    """
    Calculate the functional enrichment using the method described by Carlota.
    Parameters:
        @dicbp2:                A dictionary containing the associations of functions to their corresponding genes
        @N:                     The total number of genes
        @geneset_toenrich:      The gene set to check the enrichment
        @type_correction:       The multiple test p-value correction (fdr_bh / bonferroni)
        @pvalue_threshold:      P-value threshold of the multiple test p-value correction
    Returns:
        A tuple ``(passed, terms, term_to_values)``: ``passed`` is True when at
        least one term survives the correction, ``terms`` lists those terms and
        ``term_to_values`` maps each of them to
        ``[pval, pval_corrected, x, m]`` (x = genes of the term found in the
        gene set, m = genes annotated to the term).
    """
    # A set gives constant-time membership tests; k keeps the original length.
    gene_lookup = set(geneset_toenrich)
    k = len(geneset_toenrich)

    pvals = {}
    genes_enriched = {}
    for term, term_genes in dicbp2.items():
        m = len(term_genes)
        x = sum(1 for gene in term_genes if gene in gene_lookup)
        if x != 0:
            # P(X >= x) for population N, m annotated genes and k draws.
            pvals[term] = hypergeom.sf(x - 1, N, m, k)
            genes_enriched[term] = [x, m]

    terms_l = []
    term_to_values = {}
    if not pvals:
        # No term shares a gene with the gene set, so nothing to correct.
        return False, terms_l, term_to_values

    terms = list(pvals.keys())
    pvals_values = list(pvals.values())
    # Element 1 of the multipletests result holds the corrected p-values.
    corrected = multipletests(pvals_values,
                              alpha=0.05,
                              method=type_correction,
                              is_sorted=False,
                              returnsorted=False)[1]
    for term, pval, pval_corrected in zip(terms, pvals_values, corrected):
        if pval_corrected < pvalue_threshold:
            x, m = genes_enriched[term]
            term_to_values[term] = [pval, pval_corrected, x, m]
            terms_l.append(term)
    return bool(terms_l), terms_l, term_to_values
def wang_cpci(n, N, M, lciw, uciw):
    """Wang method: the coverage probability function.

    Returns one value per entry of ``M``: the sum over j = 0..n of
    ``wang_ind(M_i, lciw[j], uciw[j]) * hypergeom.pmf(j, N, M_i, n)``.
    """
    coverage = []
    for idx in range(len(M)):
        successes = M[idx]
        terms = [
            wang_ind(successes, lciw[j], uciw[j])
            * hypergeom.pmf(j, N, successes, n)
            for j in range(n + 1)
        ]
        coverage.append(sum(terms))
    return coverage
Exemple #24
0
def calcH(N, Z):
    # Input: N group size, Z population size
    # Output: H[k,K] hypergeometric function (k individuals in group, K individuals in population)
    import numpy as np
    from scipy.stats import hypergeom
    group_counts = np.arange(N + 1)
    H = np.zeros((N + 1, Z + 1))
    # Fill one column per population count K; each column holds the pmf for
    # every possible group count k.
    for K in range(Z + 1):
        H[:, K] = hypergeom.pmf(group_counts, Z, K, N)
    return H
def geometric_test(G, list1, list2, genes, overlap=119, N=10261):
    """Print and return a hypergeometric upper-tail p-value for an overlap.

    Parameters:
        G       -- graph; ``G.subgraph(nodes)`` is used to keep only the
                   entries of each list that are present in the graph.
        list1   -- first node list (its size in ``G`` is the success count).
        list2   -- second node list (its size in ``G`` is the number of draws).
        genes   -- accepted for interface compatibility; not used.
        overlap -- observed overlap between the two lists (default 119, the
                   value previously hard-coded).
        N       -- population size (default 10261, the value previously
                   hard-coded).

    Returns:
        P(X >= overlap) for X ~ Hypergeometric(population N, M successes,
        n draws).
    """
    M = len(list(G.subgraph(list1)))
    n = len(list(G.subgraph(list2)))

    print(overlap, M, n, N)
    # scipy's argument order is (k, population, successes, draws). The former
    # call passed the list size as population and N as draws, and used pmf at
    # ``overlap - 1``; the ``- 1`` offset is the survival-function idiom for
    # P(X >= overlap), which is what is computed here.
    pval = hypergeom.sf(overlap - 1, N, M, n)
    print(pval)
    return pval
Exemple #26
0
def matrix_double_sample_selection(n, N, s=0):
    """Build a (2n+1) x (n+1) matrix from Qs weighted by hypergeometric pmfs.

    Entry [ip, io] is the sum over nc = 0..2n and ic = 0..nc of
    ``Qs(io, n, ic, nc, N, s, cache) * hypergeom.pmf(ic, 2n, ip, nc)``.
    Returns the matrix together with the NaN-initialised cache array that is
    handed to ``Qs``.
    """
    pool = 2 * n
    side = pool + 1
    mtx = np.zeros((side, n + 1))
    cache = np.full((side, side, side, side), np.nan)
    for ip in range(side):
        for io in range(n + 1):
            for nc in range(side):
                for ic in range(nc + 1):
                    weight = hypergeom.pmf(ic, pool, ip, nc)
                    mtx[ip, io] += Qs(io, n, ic, nc, N, s, cache) * weight
    return mtx, cache
Exemple #27
0
def matrix(no=5, s=1 / 1000, N=1000, max_t=1):
    """Build the (no+1) x (no+1) matrix ``mtx`` (Q in the text, indexed (i_o, i_p)).

    Column ``io`` accumulates ``P0(io, no, ic, nc, ...)`` times the
    hypergeometric pmf of ``ic`` (population ``no``, every possible success
    count, ``nc`` draws) over nc = 0..no and ic = 0..nc. Returns the matrix
    and the two NaN-initialised cache arrays handed to ``P0``.
    """
    dim = no + 1
    mtx = np.zeros((dim, dim))
    # p0 = T_0 in the text; arguments (i_o, n_o, i_p, n_p)
    cache_P0 = np.full((dim, dim, dim, dim), np.nan)
    # pf = T_r (i_o, n_o, i_p, n_p, r)
    cache_Pf = np.full((dim, dim, dim, dim, max_t + 1), np.nan)
    success_counts = np.arange(0, dim)
    for nc in range(dim):  # n_c is n_p in the text
        for ic in range(nc + 1):
            # One pmf call covers every ip at once.
            p = hypergeom.pmf(ic, no, success_counts, nc)
            for io in range(dim):
                term = P0(io, no, ic, nc, s, N, max_t, cache_P0, cache_Pf)
                mtx[:, io] += term * p
    return mtx, cache_P0, cache_Pf
Exemple #28
0
def matrix_selection(n, N, s=0):
    """Build an (n+1) x (n+1) matrix from Qs weighted by hypergeometric pmfs.

    Column ``io`` accumulates ``Qs(io, n, ic, nc, N, s, cache)`` times the pmf
    of ``ic`` (population ``n``, every possible success count, ``nc`` draws)
    over nc = 0..n and ic = 0..nc. Returns the matrix and the cache array.
    """
    dim = n + 1
    mtx = np.zeros((dim, dim))
    # Allocates more than strictly needed, but a dense array is easier for numba.
    cache = np.full((dim, dim, dim, dim), np.nan)
    success_counts = np.arange(0, dim)
    for nc in range(dim):
        for ic in range(nc + 1):
            # One pmf call covers every ip at once.
            p = hypergeom.pmf(ic, n, success_counts, nc)
            for io in range(dim):
                term = Qs(io, n, ic, nc, N, s, cache)
                mtx[:, io] += term * p
    return mtx, cache
def probability_class_cluster(clazz, klust):
    """Sum hypergeom.pmf(2, D, 2, size) over cluster/class intersections, minus a pairwise term.

    ``D`` is the number of rows of ``klust``. The first sum runs over every
    (cluster column, class column) intersection; the second over the overlap
    of each such intersection with the intersections of strictly later
    cluster and class columns. Empty overlaps contribute nothing.
    """
    n_rows = klust.shape[0]
    n_clusters = klust.shape[1]
    n_classes = clazz.shape[1]

    single = 0
    for i in range(n_clusters):
        for j in range(n_classes):
            size = len(intersection(klust, i, clazz, j))
            if size > 0:
                single += hypergeom.pmf(2, n_rows, 2, size)

    paired = 0
    for i in range(n_clusters):
        for j in range(n_classes):
            for ip in range(i + 1, n_clusters):
                for jp in range(j + 1, n_classes):
                    members = intersection(klust, i, clazz, j)
                    later_members = intersection(klust, ip, clazz, jp)
                    size = len(numpy.intersect1d(members, later_members))
                    if size > 0:
                        paired += hypergeom.pmf(2, n_rows, 2, size)

    return single - paired
Exemple #30
0
 def cpci(M):
     """Return one value per entry of ``M``.

     Each value is the sum over uu = 0..n of
     ``ind(M[i], lciw[uu], uciw[uu]) * hypergeom.pmf(uu, N, M[i], n)``;
     ``n``, ``N``, ``lciw``, ``uciw`` and ``ind`` come from the enclosing scope.
     """
     coverage = np.arange(len(M)).astype(np.float64)
     for i in np.arange(len(M)):
         terms = np.arange(n + 1).astype(np.float64)
         for uu in range(n + 1):
             inside = ind(M[i], lciw[uu], uciw[uu])
             terms[uu] = inside * hypergeom.pmf(uu, N, M[i], n)
         coverage[i] = sum(terms)
     return coverage
Exemple #31
0
def matrix(no=5, s=1 / 1000, N=1000, max_t=1):
    """Build a (no+1) x (no+1) matrix from P0 weighted by hypergeometric pmfs.

    For every nc = 0..no and ic = 0..nc, column ``io`` receives
    ``P0(io, no, ic, nc, s, N, max_t, cache_P0, cache_Pf)`` multiplied by the
    pmf of ``ic`` evaluated for every success count 0..no (population ``no``,
    ``nc`` draws). Returns the matrix and both NaN-initialised caches.
    """
    size = no + 1
    cache_shape = (size, size, size, size)
    mtx = np.zeros((size, size))
    cache_P0 = np.full(cache_shape, np.nan)
    cache_Pf = np.full(cache_shape + (max_t + 1,), np.nan)
    all_ip = np.arange(0, size)
    for nc in range(0, size):
        for ic in range(0, nc + 1):
            # Vectorised over ip.
            p = hypergeom.pmf(ic, no, all_ip, nc)
            for io in range(0, size):
                factor = P0(io, no, ic, nc, s, N, max_t, cache_P0, cache_Pf)
                mtx[:, io] += factor * p
    return mtx, cache_P0, cache_Pf
def checktheory(thres, n, ne, p, te, s):
    """Compute failure estimates under two models for each value in ``te``.

    For every pair of norms (l1, l2) in 0..n, weighted by their binomial
    probability, the failure probability is averaged over the
    hypergeometrically distributed number of nonzero elements ``se``.

    Returns:
        ``(new, old)``: two lists aligned with ``te``. ``new`` accumulates
        ``pl * LACprob(failtmp, ne, te1)`` per norm pair; ``old`` applies
        ``LACprob`` once to the aggregate failure probability.
    """
    # calculate failure probability for a certain number of ones in se
    pfaildict = {}
    for se in range(0, n + 1):
        tmp = [binom.sf((thres + i + se) / 2, se, p=0.5) * s[i] for i in s]
        tmp2 = [binom.sf((thres + 1 + i + se) / 2, se, p=0.5) * s[i]
                for i in s]
        pfaildict[se] = 1.5 * sum(tmp) + 0.5 * sum(tmp2)

    # set everything to zero
    fail = 0
    fail2 = {te1: 0 for te1 in te}

    # Unordered pairs with repetition drawn from n + 1 values: C(n + 2, 2).
    # This is the number of items the iterator below yields, so the progress
    # bar ends at exactly 100%.
    pair_count = (n + 1) * (n + 2) // 2

    # loop over all norm values
    norm_pairs = itertools.combinations_with_replacement(range(0, n + 1), 2)
    for l1, l2 in tqdm(norm_pairs, leave=False, total=pair_count):
        # probability of a certain norm
        pl = binom.pmf(l1, n=n, p=p) * binom.pmf(l2, n=n, p=p)
        if l1 != l2:
            # (l1, l2) and (l2, l1) are visited only once.
            pl *= 2

        # skip if probability is too small
        if pl < 2**-200:
            continue

        # calculate the probability of a failure, averaged over all possible
        # numbers of nonzero elements in se
        failtmp = 0
        for se1 in range(max(0, l1 + l2 - n), min(l1, l2) + 1):
            # probability of number of nonzero elements in se
            pse = hypergeom.pmf(k=se1, M=n, n=l1, N=l2)
            failtmp += pse * pfaildict[se1]

        # for new model, take error correction into account
        fail += pl * failtmp
        for te1 in te:
            fail2[te1] += pl * LACprob(failtmp, ne, te1)

    new = []
    old = []
    for te1 in te:
        # for old model, take error correction into account
        old.append(LACprob(fail, ne, te1))
        new.append(fail2[te1])

    return new, old
def rho(i,u_i,n):
    # n = number of boxes
    # Rho(i,x) = P(u_12 >= 7 | U_i=u_i) = P(red majority given U_i=u_i)

    # Numerator: P(majority red and U_i=u_i), summing
    # P(u_i=x|u_n=j) = hypergeom.pmf(x,n,j,i) over every j above int(n/2).
    majority = range(int((n/2))+1, n+1)
    numerator = sum(hypergeom.pmf(u_i,n,j,i) for j in majority)/n
    # Denominator: P(U_i=u_i)
    denominator = beta(u_i,i,n)
    return numerator/denominator
Exemple #34
0
def visualize(hgt_preprocessing_file_name):
    """Show a stored HGT matrix as a colour-coded image with a guide line.

    Loads the array saved under ``BASE_OUTPUT_DIR``, recodes zeros as -1 and
    draws it with a four-colour map whose boundaries are -1, -0.1, ``mHGT``
    and 1. A green segment joins the first diagonal cell whose upper-tail
    hypergeometric probability is at most ``mHGT`` to the cell just before the
    first top-row entry whose tail probability reaches ``mHGT``.
    """
    HGTs = np.load(os.path.join(BASE_OUTPUT_DIR, hgt_preprocessing_file_name))
    HGTs[HGTs == 0] = -1
    n_rows, n_cols = np.shape(HGTs)
    B = n_rows - 1
    N = n_cols - 1
    mHGT = 0.0002

    def upper_tail(k, draws):
        # sf + pmf: probability of k or more (population N, B successes).
        return hypergeom.sf(k, N, B, draws) + hypergeom.pmf(k, N, B, draws)

    left_tails = [upper_tail(i, i) for i in range(0, B + 1)]
    top_tails = [upper_tail(B, i) for i in range(0, N + 1)]

    for idx, prob in enumerate(left_tails):
        if prob <= mHGT:
            left_edge = (idx, idx)
            break

    for idx, prob in enumerate(top_tails):
        if prob >= mHGT:
            top_edge = (idx - 1, B)
            break

    # Line through the two edge points; neither value is used by the plot.
    rise = left_edge[1] - top_edge[1]
    run = (left_edge[0] - top_edge[0]) * 1.0
    slope = rise / run
    constant = top_edge[1] - slope * top_edge[0]

    bounds = [-1, -0.1, mHGT, 1, 1]
    cmap = colors.ListedColormap(["red", 'gray', 'skyblue', "red"])
    norm = colors.BoundaryNorm(bounds, cmap.N)

    fig, ax = plt.subplots()
    ax.imshow(HGTs, cmap=cmap, norm=norm)

    # draw gridlines
    ax.grid(which='minor', axis='both', linestyle='-', color='k', linewidth=1)
    ax.invert_yaxis()
    ax.set_yticks(np.arange(0, 101, 8000))
    xs = [left_edge[0], top_edge[0]]
    ys = [left_edge[1], top_edge[1]]
    plt.plot(xs, ys, "green")
    plt.show()
Exemple #35
0
def compute_pmatrix(X):
    """Computes the p-matrix of input binary matrix `X` using
    hypergeometric pmf. See Lima-Mendez 2008.

    For rows i and j with row sums a and b and dot product c, entry
    P[i, j] = P[j, i] is the sum of hypergeom.pmf(k, n, a, b) for
    k = c .. min(a, b), where n is the number of columns. When either row is
    empty the entry is 1. The diagonal is left at 0. Progress is printed once
    per row.
    """
    (N, n) = X.shape
    P = np.zeros((N, N))
    # Row sums are needed for every pair, so compute them once up front.
    row_sums = np.asarray(X).sum(axis=1)

    for i in range(N):
        print("%d / %d" % (i + 1, N))
        a = row_sums[i]
        for j in range(i + 1, N):
            b = row_sums[j]
            c = np.dot(X[i, :], X[j, :])
            C = min(a, b)
            # Value comparison: an identity test against the literal 0 is
            # never false for numpy integers.
            if C != 0:
                P[i, j] = sum(hypergeom.pmf(range(c, C + 1), n, a, b))
            else:
                # An empty row cannot share anything, so nothing is more
                # extreme than the observed overlap of zero.
                P[i, j] = 1
    P = P + P.T
    return P
Exemple #36
0
def compute_pval(ra, rb):
    """
    Compute the pval between two binary gene
    profiles. pval is defined using hypergeometric
    formula.

    With a = sum(ra), b = sum(rb) and c = ra . rb, the value is the sum of
    hypergeom.pmf(k, len(ra), a, b) for k = c .. min(a, b), or 1 when either
    profile is empty.

    Input:
        ra: series
        rb: series
    """
    from scipy.stats import hypergeom

    n = len(ra)
    a = sum(ra)
    b = sum(rb)
    c = np.dot(ra, rb)
    C = min(a, b)
    # Compare by value: an identity test against the literal 0 is never false
    # for numpy integers.
    if C != 0:
        pval = sum(hypergeom.pmf(range(c, C + 1), n, a, b))
    else:
        pval = 1

    return pval
Exemple #37
0
def plot_cell_enrichments(
    ds,
    f=None,
    enrichments=None,
    ax=None,
    title=None,
):
    """Draw a bar plot of hypergeometric enrichment p-values per cell type.

    Parameters:
        ds: data set object; this function uses its ``species``, ``filter``
            and ``shape`` members.
        f: a filter dict, or a list of filter dicts, passed to ``ds.filter``
            to define "hits". Defaults to ``{'p': .05, 'asym_fold': 1.25}``.
        enrichments: mapping whose values are cell type names; defaults to
            ``brs.enrichments.get_enrichments`` for the first species of
            ``ds``.
        ax: matplotlib axes to draw on; a new 4x3 figure is created if None.
        title: optional axes title.

    Returns:
        ``(figure, ax)`` for the axes that were drawn on.
    """
    LOGGER.info("Plotting cell type enrichments")

    if enrichments is None:
        enrichments = brs.enrichments.get_enrichments(
            list(ds.species)[0],
        )

    if f is None:
        f = {'p': .05, 'asym_fold': 1.25}

    # A single filter is treated as a one-element list of filters.
    if isinstance(f, dict):
        f = [f]

    if ax is None:
        _, ax = plt.subplots(figsize=(4, 3))

    # For each cell type, the enrichment keys that map to it.
    cell_prots = {
        cell: [key for key, val in enrichments.items() if val == cell]
        for cell in brs.CELL_TYPES
    }

    # Shorten the oligodendrocyte label, but only when some cell type whose
    # name contains 'Oligo' has at least one key assigned to it.
    display_name = {
        'Myelinating Oligodendrocytes': 'Oligodendrocytes',
    } if sum([
        'Oligo' in cell and len(cell_prots.get(cell, [])) > 0
        for cell in brs.CELL_TYPES
    ]) else {}

    vals = []

    # Background set: entries for any listed key whose 'Proteins' field has
    # fewer than two elements.
    ds = ds.filter(
        protein=set(j for i in cell_prots.values() for j in i),
        fn=lambda x: len(x['Proteins']) < 2,
    )
    # One hatch pattern per filter, reused cyclically.
    hatches = [
        "",
        "//",
        "o",
        "x",
        ".",
    ]

    for cell in brs.CELL_TYPES:
        for ind, fil in enumerate(f):
            # Foreground: the subset of the background belonging to this cell.
            dc = ds.filter(protein=set(cell_prots[cell]))

            fore_hits = dc.filter(fil).shape[0]
            fore_size = dc.shape[0]

            back_hits = ds.filter(fil).shape[0]
            back_size = ds.shape[0]

            if fore_size < 1 or back_size < 1:
                continue

            # 1 - cdf + pmf: probability of at least fore_hits hits.
            # NOTE(review): hypergeom.sf would avoid the cancellation in
            # ``1 - cdf`` for very small p-values.
            val = (
                1 -
                hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) +
                hypergeom.pmf(fore_hits, back_size, back_hits, fore_size)
            )
            vals.append(
                pd.Series(
                    OrderedDict([
                        ('cell', display_name.get(cell, cell)),
                        ('fore hits', fore_hits),
                        ('fore size', fore_size),
                        ('back hits', back_hits),
                        ('back size', back_size),
                        ('p-value', val),
                        ('-log10 p-value', -np.log10(val)),
                        ('color', brs.CELL_COLORS[cell]),
                        ('hatch', hatches[ind % len(hatches)]),
                        ('hue', format_title(f=fil)),
                    ])
                )
            )

    df = pd.DataFrame(vals)

    ax = sns.barplot(
        data=df,
        y='cell',
        x='-log10 p-value',
        hue='hue',
        ax=ax,
    )

    # Reference line at p = 0.01.
    ax.axvline(-np.log10(.01), color='k', linestyle=':')
    # Legend built by hand: one white, hatched patch per filter.
    ax.legend(
        handles=[
            mpatches.Patch(
                facecolor='w',
                edgecolor='k',
                hatch=i,
                label=df['hue'].iloc[ind],
            )
            for ind, i in enumerate(hatches[:len(f)])
        ]
    )

    # Pair the bars, ordered by their y position, with the rows of df to
    # apply each row's hatch and colour.
    # NOTE(review): this presumes the sorted patch order matches the row
    # order of df -- confirm against seaborn's bar layout.
    for hatch, color, p in zip(
        df['hatch'],
        df['color'],
        sorted(ax.patches, key=lambda x: x.xy[1]),
    ):
        p.set_hatch(hatch)
        p.set_facecolor(color)
        p.set_edgecolor('k')

    if title:
        ax.set_title(title)

    ax.set_ylabel('')
    ax.set_xlabel('p-value')
    # Ticks sit on a -log10 scale; label them with the p-values they denote.
    ax.set_xticklabels(['{:.3}'.format(10 ** -i) for i in ax.get_xticks()])

    return ax.get_figure(), ax
def calculate_downsampled_estimate(i,n,m):
    """Return hypergeometric probabilities for the interior counts 1..m-1.

    The hypergeometric pmf is evaluated with population size ``n``, ``i``
    marked items in the population and ``m`` draws. Counts outside the
    attainable range keep probability zero.

    Returns a numpy array of length ``m - 1``: the probabilities of drawing
    exactly 1, 2, ..., m-1 marked items (the entries for 0 and ``m`` are
    dropped).
    """
    # Smallest / largest number of marked items that can appear in the draw.
    fewest = max(0, m - (n - i))
    most = min(i, m)
    attainable = numpy.arange(fewest, most + 1)

    probabilities = numpy.zeros(m + 1)
    probabilities[attainable] = (
        probabilities[attainable] + hypergeom.pmf(attainable, n, i, m)
    )
    # Strip the two boundary outcomes (0 and m marked items).
    return probabilities[1:-1]
Exemple #39
0
	
	write.writelines('#Human phenotype: ' + humanPhenotypeName)
	write.writelines('\n#\tAnalysis run ' + str(today.year) + '-' + str(today.month) + '-' + str(today.day))
	
	
	write.writelines('\n#\t' +  str(len(set(humanMicrocephalyGenes))) + ' genes of ' + str(len(humanToHom)) + ' give rise to this phenotype')
	
	
	write.writelines('\n#Zebrafish phenotype: ' + zfishAnatomy[0]+ ' ' + zfishPhenotype[0])
	p=1
	while p<len(zfishAnatomy):
		 write.writelines(',' + zfishAnatomy[p]+ ' ' + zfishPhenotype[p])
		 p+=1
	write.writelines('\n#\t' + str(numFishMicrocephaly) + ' of ' + str(numFishGenes) + ' fish genes match phenotype (' + str(100.*numFishMicrocephaly/numFishGenes) + '%)')
	write.writelines('\n#\t' + str(fishHasMicrocephaly) + ' of ' + str(geneInFish) + ' fish with human genes match phenotype (' + str(100.*fishHasMicrocephaly/geneInFish) + '%)') 
	fishPval=hypergeom.pmf(fishHasMicrocephaly,numFishGenes,numFishMicrocephaly,geneInFish)
	write.writelines('\n#\tp-value = ' + str(fishPval) + ' by hypergeometic test') 
	
	write.writelines('\n#Mouse phenotype: ' + mousePhenotypesReference[0])
	for pheno in mousePhenotypesReference[1:]:
		write.writelines(',' + pheno)
	write.writelines('\n#\t' + str(numMouseMicrocephaly) + ' of ' + str(numMouseGenes) + ' mice genes match phenotype (' + str(100.*numMouseMicrocephaly/numMouseGenes) + '%)')
	write.writelines('\n#\t' + str(mouseHasMicrocephaly) + ' of ' + str(geneInMouse) + ' mice with human genes match phenotype (' + str(100.*mouseHasMicrocephaly/geneInMouse) + '%)') 
	mousePval=hypergeom.pmf(mouseHasMicrocephaly,numMouseGenes,numMouseMicrocephaly,geneInMouse)
	write.writelines('\n#\tp-value = ' + str(mousePval) + ' by hypergeometic test') 
	
	write.writelines(toWrite)
	
	
	write.close()
	if fishHasMicrocephaly>0:
Exemple #40
0
def fisher_exact(c) :
    """Performs a Fisher exact test on a 2x2 contingency table.

    The two-sided p-value is the total hypergeometric probability of all
    tables (with the observed margins) that are no more likely than the
    observed one. The tail on the observed side is taken directly from the
    cdf/sf; the start of the opposite tail is located with a binary search
    on the unimodal pmf.

    Parameters
    ----------
    c : array_like of ints
        A 2x2 contingency table.

    Returns
    -------
    oddsratio : float
        This is prior odds ratio and not a posterior estimate.
        ``inf`` when either off-diagonal cell is zero.
    p-value : float
        P-value for 2-sided hypothesis of independence.


    Examples
    --------
    >>> fisher_exact([[100, 2], [1000, 5]])
    (0.25, 0.13007593634330314)
    """

    c = np.asarray(c, dtype=np.int64)  # int32 is not enough for the algorithm
    if c[1,0] > 0 and c[0,1] > 0:
        oddsratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1])
    else:
        oddsratio = np.inf
    n1 = c[0,0] + c[0,1]
    n2 = c[1,0] + c[1,1]
    n  = c[0,0] + c[1,0]
    total = n1 + n2

    def pmf(x):
        # Probability of a table whose top-left cell equals x.
        return hypergeom.pmf(x, total, n1, n)

    mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
    pexact = pmf(c[0,0])
    pmode = pmf(mode)

    # Relative tolerance used when comparing probabilities.
    epsilon = 1 - 1e-4
    # The observed table is (numerically) the most likely one: every table
    # counts as at least as extreme, so the p-value is 1.
    # NOTE: builtin max() is required here; np.max(a, b) would treat b as
    # the `axis` argument.
    if float(np.abs(pexact - pmode)) / np.abs(max(pexact, pmode)) <= 1 - epsilon:
        return oddsratio, 1.0

    if c[0,0] < mode:
        # Observed value lies left of the mode: lower tail is exact.
        plower = hypergeom.cdf(c[0,0], total, n1, n)

        # Even the far right end is more likely than the observation, so
        # the upper tail contributes nothing.
        if pmf(n) > pexact / epsilon:
            return oddsratio, plower

        # Binary search for where to begin upper half.
        lo = mode
        hi = n
        guess = -1
        while hi - lo > 1:
            # Floor division keeps the guess integral on Python 3 as well.
            guess = (hi + lo) // 2

            pguess = pmf(guess)
            if pguess <= pexact and pmf(guess - 1) > pexact:
                break
            elif pguess < pexact:
                hi = guess
            else:
                lo = guess

        if guess == -1:
            guess = lo

        # Nudge the boundary so ties within the tolerance are handled.
        while guess > 0 and pmf(guess) < pexact * epsilon:
            guess -= 1

        while pmf(guess) > pexact / epsilon:
            guess += 1

        p = plower + hypergeom.sf(guess - 1, total, n1, n)
        return oddsratio, min(p, 1.0)

    # Observed value lies at or right of the mode: upper tail is exact.
    pupper = hypergeom.sf(c[0,0] - 1, total, n1, n)
    if pmf(0) > pexact / epsilon:
        return oddsratio, pupper

    # Binary search for where to begin lower half.
    lo = 0
    hi = mode
    guess = -1
    while hi - lo > 1:
        guess = (hi + lo) // 2
        pguess = pmf(guess)
        if pguess <= pexact and pmf(guess + 1) > pexact:
            break
        elif pguess <= pexact:
            lo = guess
        else:
            hi = guess

    if guess == -1:
        guess = lo

    while pmf(guess) < pexact * epsilon:
        guess += 1

    while guess > 0 and pmf(guess) > pexact / epsilon:
        guess -= 1

    p = pupper + hypergeom.cdf(guess, total, n1, n)
    return oddsratio, min(p, 1.0)
Exemple #41
0
def _motif_sig(fore_hits, fore_size, back_hits, back_size):
    """Return the upper-tail hypergeometric p-value P(X >= fore_hits).

    X is hypergeometric with population size ``back_size``, ``back_hits``
    marked items in the population and ``fore_size`` draws.

    The survival function is used instead of ``1 - cdf + pmf``: the latter
    subtracts two numbers close to 1 and loses the whole tail beyond
    ``fore_hits`` (or returns rounding noise) once the p-value drops below
    machine epsilon.
    """
    # sf(k - 1) == P(X > k - 1) == P(X >= k) for integer counts.
    return hypergeom.sf(fore_hits - 1, back_size, back_hits, fore_size)
def validate_over_under_represented_fast(g,edgeAttr,sig,correct,isDirected):
    """Find edges whose weight is over- or under-represented.

    Each edge weight is compared with a hypergeometric null model whose
    population size is the total edge weight, with the weighted degree of
    the source as the number of marked items and the weighted degree of
    the target as the number of draws.

    Arguments:
        g          -- networkx graph (``in_degree``/``out_degree`` are used,
                      so a directed graph is presumably expected -- confirm
                      against callers)
        edgeAttr   -- name of the edge attribute holding the weight
        sig        -- significance level
        correct    -- if True, divide ``sig`` by T = e + nB*nL - nLB
                      (Bonferroni)
        isDirected -- use out/in weighted degree when true, plain weighted
                      degree otherwise

    Returns a tuple ``(over, under)`` of lists of ``(source, target)``
    pairs whose upper-tail / lower-tail p-value is below the threshold.
    """
    weights = [e[2][edgeAttr] for e in g.edges(data=True)]
    sumWeights = int(np.sum(weights))

    e = g.number_of_edges()
    nB = len([u for u in g if g.in_degree(u) > 0])
    nL = len([u for u in g if g.out_degree(u) > 0])
    nLB = len([u for u in g if g.out_degree(u) > 0 and g.in_degree(u) > 0])
    T = e + nB*nL - nLB

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("e: " + str(e))
    print("nB: " + str(nB))
    print("nL: " + str(nL))
    print("nLB: " + str(nLB))
    print("T: " + str(T))

    if correct == True:
        multivariateSignificanceCorrection = sig/float(T)#Bonferroni
    else:
        multivariateSignificanceCorrection = sig

    # networkx 1.x exposes adjacency_iter(); 2.x renamed it to adjacency().
    adjacency = getattr(g, 'adjacency_iter', None)
    if adjacency is None:
        adjacency = g.adjacency

    pvalOver = {}
    pvalUnder = {}

    # One pass over the edges: compute both tail p-values.
    for source, nbrsdict in adjacency():
        for target, keydict in nbrsdict.items():
            if edgeAttr not in keydict:
                continue
            eattr = keydict[edgeAttr]

            if isDirected:
                sout = g.out_degree(source,weight=edgeAttr)
                sin = g.in_degree(target,weight=edgeAttr)
            else:
                sout = g.degree(source,weight=edgeAttr)
                sin = g.degree(target,weight=edgeAttr)

            observed = int(eattr)
            # The weight cannot exceed either endpoint's weighted degree.
            largest = int(min(sin, sout))

            # Over-representation: P(X >= observed).
            pvalOver[(source,target)] = sum(
                hypergeom.pmf(X, sumWeights, sout, sin, loc=0)
                for X in range(observed, largest + 1)
            )
            # Under-representation: P(X <= observed).
            pvalUnder[(source,target)] = sum(
                hypergeom.pmf(X, sumWeights, sout, sin, loc=0)
                for X in range(0, observed + 1)
            )

    # Reject the null hypothesis where the p-value is below the threshold.
    validatedOver = [
        edge for edge in pvalOver
        if pvalOver[edge] < multivariateSignificanceCorrection
    ]
    validatedUnder = [
        edge for edge in pvalOver
        if pvalUnder[edge] < multivariateSignificanceCorrection
    ]

    return (validatedOver, validatedUnder)
def validate_over_represented_fast(g,edgeAttr,sig,correct,isDirected):
    """Find edges whose weight is over-represented.

    Each edge weight is compared with a hypergeometric null model whose
    population size is the total edge weight, with the weighted degree of
    the source as the number of marked items and the weighted degree of
    the target as the number of draws.

    Arguments:
        g          -- networkx graph
        edgeAttr   -- name of the edge attribute holding the weight
        sig        -- significance level
        correct    -- if True, divide ``sig`` by the number of edges
                      (Bonferroni)
        isDirected -- use out/in weighted degree when true, plain weighted
                      degree otherwise

    Returns the list of ``(source, target)`` pairs whose upper-tail
    p-value is below the threshold.
    """
    weights = [e[2][edgeAttr] for e in g.edges(data=True)]
    sumWeights = int(np.sum(weights))

    if correct == True:
        multivariateSignificanceCorrection = sig/float(g.number_of_edges())#Bonferroni
    else:
        multivariateSignificanceCorrection = sig

    # networkx 1.x exposes adjacency_iter(); 2.x renamed it to adjacency().
    adjacency = getattr(g, 'adjacency_iter', None)
    if adjacency is None:
        adjacency = g.adjacency

    pval = {}

    # One pass over the edges: upper-tail p-value P(X >= weight).
    for source, nbrsdict in adjacency():
        for target, keydict in nbrsdict.items():
            if edgeAttr not in keydict:
                continue
            eattr = keydict[edgeAttr]

            if isDirected:
                sout = g.out_degree(source,weight=edgeAttr)
                sin = g.in_degree(target,weight=edgeAttr)
            else:
                sout = g.degree(source,weight=edgeAttr)
                sin = g.degree(target,weight=edgeAttr)

            observed = int(eattr)
            # The weight cannot exceed either endpoint's weighted degree.
            largest = int(min(sin, sout))

            pval[(source,target)] = sum(
                hypergeom.pmf(X, sumWeights, sout, sin, loc=0)
                for X in range(observed, largest + 1)
            )

    # Reject the null hypothesis where the p-value is below the threshold.
    return [
        edge for edge in pval
        if pval[edge] < multivariateSignificanceCorrection
    ]
def calc_hypergeometric(pathway_file, stats_file, go_list, threshold_go_list,
                        pathway_ids_list):
    """
    Compute one-sided hypergeometric tail p-values for each pathway.

    N = number of GOs in study
    n = number of GOs in given pathway
    m = number of disease-associated GOs
    k = number of disease-associated GOs in given pathway

    For every id in pathway_ids_list the upper tail P(X >= k), the lower
    tail P(X <= k) and the smaller of the two are stored.

    Returns a tuple (pvalues_df, err). On success pvalues_df is a DataFrame
    indexed by pathway_ids_list and err is ''. If any requested pathway id
    is not a column of the pathway file, pvalues_df is '' and err holds the
    error message.
    """
    ## Load both input tables
    pathway_df = csv_io.csv_to_data_frame(pathway_file, delimiter=',')
    stats_df = csv_io.csv_to_data_frame(stats_file)

    ## Every requested pathway id must be a column of the pathway table
    available_ids = set(pathway_df.columns.tolist())
    unknown_ids = [pid for pid in pathway_ids_list
                   if pid not in available_ids]
    if unknown_ids:
        message = "ERROR: %s does not contain pathway_ids: %s" % (
               pathway_file, unknown_ids)
        return '', message

    ## NOTE(review): DataFrame.ix is used throughout; it was removed in
    ## newer pandas releases -- confirm the pandas version this targets.

    ## Study-wide counts, shared by all pathways
    N = len(go_list)
    m = len(stats_df.ix[threshold_go_list])

    ## One list per output column, filled in pathway order
    col_N, col_n, col_m, col_k = [], [], [], []
    col_upper, col_lower, col_pvalue = [], [], []

    for pw_id in pathway_ids_list:
        ## Disease-associated GOs in this pathway, then all GOs in it
        k = int(pathway_df.ix[threshold_go_list][pw_id].sum())
        n = int(pathway_df.ix[go_list][pw_id].sum())

        ## Upper tail: sum the pmf from k up to the largest possible overlap
        tail_support = range(k, min(m, n) + 1)
        p_upper = float(sum(hypergeom.pmf(tail_support, N, m, n)))
        ## Lower tail: complement of the upper tail, adding back P(X == k)
        p_lower = float(1 - p_upper + hypergeom.pmf(k, N, m, n))

        col_N.append(N)
        col_n.append(n)
        col_m.append(m)
        col_k.append(k)
        col_upper.append(p_upper)
        col_lower.append(p_lower)
        col_pvalue.append(min(p_upper, p_lower))

    ## Assemble the result table, one row per pathway id
    hyper_dict = {'N': col_N, 'n': col_n, 'm': col_m, 'k': col_k,
                  'p_upper': col_upper, 'p_lower': col_lower,
                  'pvalue': col_pvalue}
    return pd.DataFrame(hyper_dict, index=pathway_ids_list), ''
Exemple #45
0
import sa, statistics, random_assembly
import networkx as nx
from scipy import stats
from scipy.stats import hypergeom
import sys

if __name__ == "__main__":
    # Compare each piece's neighbourhood between consecutive assemblies and
    # report how often 0..curr_size neighbours are shared, next to the
    # hypergeometric expectation.
    curr_size = 6
    assemblies = sa.load_all(dir=sys.argv[1])
    intersect_num = []
    for i in range(0, len(assemblies)-1):
        for piece_id, pi in assemblies[i].pieces.items():
            try:
                shared = len(pi.neighbourhood.intersection(
                    assemblies[i+1].pieces[piece_id].neighbourhood))
            except KeyError:
                # Presumably the piece is absent from the next assembly;
                # such pieces are skipped.
                continue
            intersect_num.append(shared)

    total = len(intersect_num)
    for i in range(0, curr_size+1):
        found = intersect_num.count(i)
        # Avoid ZeroDivisionError when no piece could be compared.
        share = round(100*found/total, 2) if total else 0.0
        print('Found', i, 'shared: ', found, share, \
            '%; expected: ', round(100*hypergeom.pmf(i, 100, curr_size, curr_size), 2), '%')
Exemple #46
0
def phyper(k, K, n, N):
    """Return the hypergeometric probability of exactly ``k`` marked draws.

    Thin adapter around ``scipy.stats.hypergeom.pmf`` that reorders the
    arguments: ``N`` is passed as the population size, ``n`` as the number
    of marked items in the population and ``K`` as the number of draws.
    """
    population_size = N
    marked_in_population = n
    draws = K
    return hypergeom.pmf(k, population_size, marked_in_population, draws)