Exemple #1
0
def _jd_at_once(n1,k1,n2,k2,n3):
    """Return the cached joint pmf of two independent hypergeometric draws.

    Two frozen ``spst.hypergeom`` distributions are built from
    ``(n1, k1, n3)`` and ``(n2, k2, n3)``.  Their pmfs are combined with an
    outer product, every cell where both element-wise minimum tables are zero
    is set to zero, and the table is renormalised to sum to one.

    Returns a tuple ``(jpmf, mrange1, mrange0)`` where ``mrange1`` is the
    element-wise minimum of the two supports and ``mrange0`` the element-wise
    minimum of ``n3 - support``.  Results are memoised in the module-level
    ``joint_cache`` keyed by the five arguments.
    """
    key = (n1,k1,n2,k2,n3)
    if key in joint_cache:
        return joint_cache[key]

    def _marginal(successes, frozen, support):
        # With zero successes the draw is a point mass at 0; it is written
        # out explicitly instead of asking the frozen distribution.
        if successes == 0:
            point_mass = np.zeros(len(support))
            point_mass[0] = 1.0
            return point_mass
        return frozen.pmf(support)

    first = spst.hypergeom(n1,k1,n3)
    second = spst.hypergeom(n2,k2,n3)
    support1 = np.arange(0,min(n3+1,k1+1))
    support2 = np.arange(0,min(n3+1,k2+1))

    jpmf = np.outer(_marginal(k1, first, support1),
                    _marginal(k2, second, support2))
    mrange1 = np.minimum.outer(support1,support2)
    mrange0 = np.minimum.outer(n3-support1,n3-support2)

    # Zero out the outcomes for which both minimum tables are 0, then
    # renormalise what is left.
    jpmf[(mrange1 == 0) & (mrange0 == 0)] = 0.0
    jpmf /= jpmf.sum()

    joint_cache[key] = (jpmf,mrange1,mrange0)
    return jpmf, mrange1, mrange0
Exemple #2
0
def coco_stats():
    """
    Plot the probability of at least k hits for a range of creature counts.

    Models revealing 6 cards from a 60 card deck with a hypergeometric
    distribution and draws one curve per creature count (15 through 29)
    showing P(X >= k) for k = 1 and k = 2.  The figure is produced through
    ``plottool``; the function itself returns None.

    http://stattrek.com/online-calculator/hypergeometric.aspx

    CommandLine:
        python -m mtgmonte.stats --exec-coco_stats --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = coco_stats()
        >>> print(result)
        >>> ut.show_if_requested()
    """
    import numpy as np
    import plottool as pt
    from scipy.stats import hypergeom

    N = 60  # cards in deck
    n = 6  # cards seen by coco

    def prob_ge(k, prb):
        # P(X >= k) == P(X > k - 1) for an integer-valued variable; the
        # survival function avoids the cancellation in (1 - cdf) + pmf.
        return prb.sf(k - 1)

    pt.ensure_pylab_qt4()

    k = np.arange(1, 3)  # number of hits wanted

    K_list = np.arange(15, 30)  # candidate creature counts

    label_list = [str(K_) + " creatures in deck" for K_ in K_list]

    ydata_list = [prob_ge(k, hypergeom(N, K_, n)) for K_ in K_list]

    pt.multi_plot(
        k,
        ydata_list,
        label_list=label_list,
        title="probability of at least k hits with coco",
        xlabel="k",
        ylabel="prob",
        num_xticks=len(k),
        use_darkbackground=True,
    )
Exemple #3
0
    def fitness_group(self, x, i, j, *args):
        """
        Fitness difference between strategies i and j under group interactions.

        The population holds x i-strategists and (Z - x) j-strategists and
        groups are sampled without replacement, so the number of i-players
        among the other ``self.N - 1`` group members is hypergeometric.

        Parameters
        ----------
        x : int
            number of individuals adopting strategy i in the population
        i : int
            index of strategy i
        j : int
            index of strategy j
        args : List
            Extra positional arguments forwarded unchanged to the callables
            stored in ``self.payoffs``

        Returns
        -------
        float
            Expected payoff of strategy i minus expected payoff of strategy j
        """
        others_of_type_i = np.arange(0, self.N, dtype=np.int32)
        co_players = self.N - 1
        # A focal i-player leaves x - 1 other i-players in the remaining
        # Z - 1 individuals; a focal j-player leaves all x of them.
        seen_by_i = hypergeom(self.Z-1, x-1, co_players).pmf(others_of_type_i)
        seen_by_j = hypergeom(self.Z-1, x, co_players).pmf(others_of_type_i)

        fitness_i = 0
        fitness_j = 0
        for k, weight_i, weight_j in zip(others_of_type_i, seen_by_i, seen_by_j):
            fitness_i += self.payoffs[i, j](k + 1, self.N, *args)*weight_i
            fitness_j += self.payoffs[j, i](self.N - k, self.N, *args)*weight_j

        return fitness_i - fitness_j
def plot_hypergeom(M, N, n):
    """Draw the pmf of ``hypergeom(M=M, n=n, N=N)`` as a gray line plus markers.

    The line covers ``0..min(n, N)`` and the markers cover ``0..n``.  The
    legend entry is built by formatting ``(N, M, n)`` into the
    ``n=..,N=..,M=..`` template, i.e. the symbols in the label do not line up
    with the scipy argument names.
    """
    dist = hypergeom(M=M, n=n, N=N)
    line_support = range(min(n, N) + 1)
    marker_support = range(n + 1)
    legend_text = '$n={0},N={1},M={2}$'.format(N, M, n)

    plt.plot(line_support, dist.pmf(line_support), alpha=0.6, color='gray')
    plt.plot(marker_support, dist.pmf(marker_support), 'o', label=legend_text)
    def test_entropy(self):
        # Simple tests of entropy.
        hg = stats.hypergeom(4, 1, 1)
        h = hg.entropy()
        expected_p = np.array([0.75, 0.25])
        expected_h = -np.sum(xlogy(expected_p, expected_p))
        assert_allclose(h, expected_h)

        hg = stats.hypergeom(1, 1, 1)
        h = hg.entropy()
        assert_equal(h, 0.0)
Exemple #6
0
def land_stats():
    """
    Plot the expected number of lands drawn per turn for several land counts.

    For a 60 card deck and each land count from 24 through 27, plots the
    mean (with the standard deviation as spread) of a hypergeometric draw of
    ``turn + 7`` cards for turns 0 through 14.  Two reference curves labelled
    "minimum ok" and "maximum ok" are drawn on the same figure.  The function
    returns None.

    http://stattrek.com/online-calculator/hypergeometric.aspx

    CommandLine:
        python -m mtgmonte.stats --exec-land_stats --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from mtgmonte.stats import *  # NOQA
        >>> result = land_stats()
        >>> print(result)
        >>> ut.show_if_requested()
    """
    import numpy as np
    import plottool as pt
    from scipy.stats import hypergeom

    pt.ensure_pylab_qt4()

    N = 60  # deck size
    land_range = (24, 27 + 1)

    # N = deck_size = 40  # NOQA
    # land_range = (15, 18 + 1)

    xdata = range(0, 15)  # turn
    ydata_list = []
    spread_list = []
    for K in range(*land_range):
        # Build each frozen distribution once and reuse it for both the
        # mean and the standard deviation.
        per_turn = [hypergeom(N, K, x + 7) for x in xdata]
        ydata_list.append([dist.expect() for dist in per_turn])
        spread_list.append([dist.std() for dist in per_turn])
    # spread_list = None

    label_list = ["%d lands" % (K,) for K in range(*land_range)]
    pt.multi_plot(
        xdata, ydata_list, spread_list=spread_list, label_list=label_list, num_xticks=15, num_yticks=13, fnum=1
    )
    min_lands_acceptable = np.minimum(np.array(xdata), [1, 2, 3, 4, 5, 6] + [6] * (len(xdata) - 6))
    pt.multi_plot(
        xdata,
        [min_lands_acceptable, (np.array(xdata) ** 0.9) * 0.5 + 4],
        label_list=["minimum ok", "maximum ok"],
        num_xticks=15,
        num_yticks=13,
        fnum=1,
        marker="o",
    )
Exemple #7
0
 def prob_nohave_card_always_mulled(copies=2, hand_size=3):
     """Probability of not holding the card at turn 0 when the hand is always mulliganed.

     ``deck_size`` is read from the enclosing scope.
     """
     # probability that the opening hand contains no copy
     opening_hand = hypergeom(deck_size, copies, hand_size)
     miss_opening = opening_hand.cdf(0)
     # probability that the redraw, taken from the deck without the opening
     # hand, also contains no copy
     # (TODO: factor in the probability that you need to keep something)
     # for now its fine because if we keep shadowform the end calculation is fine
     redraw = hypergeom(deck_size - hand_size, copies, hand_size)
     miss_redraw_after_miss = redraw.cdf(0)
     # not necessary, but it shows the theory
     miss_redraw_after_hit = 1
     return (
         miss_redraw_after_miss * miss_opening + (1 - miss_opening) * miss_redraw_after_hit
     )
Exemple #8
0
def fisher_exact(table, side="two.sided", zero_correction=True):
    """Compute the odds ratio, an approximate 95% interval and Fisher's exact p-value.

    The odds ratio and its standard error come from ``_odds_ratio`` and
    ``zero_correction`` is forwarded to it for the observed table.  The
    interval is a normal approximation on the log odds ratio,
    ``exp(log(OR) +/- z * se)``, with z = 1.96 for the two-sided case and
    z = 1.645 for the one-sided cases.

    The p-value enumerates every 2x2 table that shares the observed
    marginals and sums the hypergeometric probability of those tables that
    are at least as extreme as the observed one:

    * ``"two.sided"``: tables whose probability is <= that of the observed table
    * ``"greater"``: tables whose odds ratio is >= the observed odds ratio
    * ``"less"``: tables whose odds ratio is <= the observed odds ratio

    :param table: 2x2 numpy array of counts
    :param side: one of ``"greater"``, ``"less"`` or ``"two.sided"``
    :param zero_correction: forwarded to ``_odds_ratio`` for the observed table
    :return: ``(odds_ratio, interval95, p_value)`` with ``interval95`` a
        length-2 numpy array
    :raises ValueError: if ``side`` is not one of the accepted values
    """
    if side not in ("greater", "less", "two.sided"):
        raise ValueError(
            "side parameter must be one of 'greater', 'less', or 'two.sided'")

    N = np.sum(table)
    K = np.sum(table[:, 0])
    n = np.sum(table[0])

    odds_ratio, se = _odds_ratio(table, zero_correction=zero_correction)

    # Range of the top-left cell that is compatible with the marginals.
    a_min = np.max([0, table[0][0] - table[1][1]])
    a_max = np.min([K, n])

    # The distribution depends only on the marginals, so build it once.
    dist = hypergeom(N, K, n)
    p_observed = dist.pmf(table[0][0])
    p_value = 0.0
    for a in np.arange(a_min, a_max + 1):
        p = dist.pmf(a)

        if side == "two.sided":
            # NOTE(review): exact floating point comparison; reference
            # implementations usually allow a small relative tolerance here
            # so that ties are not lost to rounding - confirm whether that
            # is wanted.
            if p <= p_observed:
                p_value += p
            continue

        # NOTE(review): the candidate tables use the default zero
        # correction of _odds_ratio, not the `zero_correction` argument.
        possible_table = np.array([[a, n - a], [K - a, N - n - K + a]])
        candidate_ratio = _odds_ratio(possible_table)[0]
        if side == "greater":
            if candidate_ratio >= odds_ratio:
                p_value += p
        elif candidate_ratio <= odds_ratio:
            p_value += p

    if side == "greater":
        interval95 = [np.exp(np.log(odds_ratio) - (1.645 * se)), np.inf]
    elif side == "less":
        interval95 = [0, np.exp(np.log(odds_ratio) + (1.645 * se))]
    else:
        interval95 = [
            np.exp(np.log(odds_ratio) - (1.96 * se)),
            np.exp(np.log(odds_ratio) + (1.96 * se))
        ]

    return odds_ratio, np.array(interval95), p_value
Exemple #9
0
        def prob_nohave_card_never_mulled(copies=2, hand_size=3):
            """Probability that neither the opening hand nor one full redraw contains the card.

            Uses a fixed deck of 30 cards.  The result is
            P(miss_turn0 | initial_miss) * P(initial_miss).
            """
            deck_size = 30
            # P(initial_miss): no copy among the first `hand_size` cards
            miss_opening = hypergeom(deck_size, copies, hand_size).cdf(0)

            # P(miss_turn0 | initial_miss): the redraw comes from the cards
            # that were not in the opening hand
            # TODO: add constraints about 2 drops
            cards_left = deck_size - hand_size
            miss_redraw = hypergeom(cards_left, copies, hand_size).cdf(0)

            return miss_redraw * miss_opening
Exemple #10
0
def Exp1(A, m, r, k):
    """
    Compute the expected number of buckets with a collision of at most k,
    using approximation 1 (usually applied when A > 500).

    The size of a single bucket is modelled as Binomial(A, 1/m) and, given a
    bucket size a, the number of matching patients in it as
    Hypergeometric(A, a, B).  Both are restricted to central intervals to
    bound the amount of work.

    :param A: number of all patients
    :param m: number of buckets
    :param r: ratio of the number of patients satisfying certain criteria to
        the number of all patients
    :param k: K in K-anonymity
    :return: expectation under approximation 1, rounded to 5 decimals
    """
    B = int(A * r)  # number of patients satisfying certain criteria
    alpha = 1 - 1 / (2 * m)

    # Central interval for the size of a single bucket (|A1| in the formula).
    rv_a = binom(A, 1 / m)
    lb_a, ub_a = rv_a.interval(alpha)
    lb_b, ub_b = hypergeom(A, int(lb_a), B).interval(alpha)

    # True when either interval starts at 0, in which case the
    # "no collision" outcome has to be ruled out explicitly below.
    zero_reachable = lb_b == 0 or lb_a == 0

    expectation = Decimal(0)
    for a in range(int(lb_a), int(ub_a) + 1):
        weight = Decimal(rv_a.pmf(a))
        if a > k:
            # Bounds for B1 given this bucket size.
            rv_b = hypergeom(A, a, B)
            if zero_reachable:
                lb_b, ub_b = rv_b.interval(alpha)
                # Rule out the case that there is no collision
                lb_b = max(1, lb_b)
            else:
                lb_b, ub_b = rv_b.interval(0.99995)
            # P(|e| <= k | |A1|), weighted by P(|A1| = a)
            expectation = expectation + P(lb_b, ub_b, k, rv_b, a) * weight
        elif zero_reachable:
            # Buckets of size <= k count unless they hold no match at all.
            rv_b = hypergeom(A, a, B)
            expectation = expectation + weight * (1 - Decimal(rv_b.pmf(0)))
        else:
            expectation = expectation + weight
    return round(expectation * m, 5)
Exemple #11
0
    def add_counting_bound_constraints_1(self):
        """Adds counting bound, for a given number of cliques zonked.

        For every value of the loop counter this builds a list of
        ``((z, num_cliques - z), probability)`` terms from a hypergeometric
        pmf and passes it, together with a bound, to ``self.add_constraint``.

        NOTE(review): the loop variable ``j`` is never used and
        ``num_cliques`` is not assigned anywhere in this method, so unless a
        global of that name exists the first iteration raises NameError.
        The original author marks the method as half-baked.
        """
        # FIXME this is half-baked
        # the probability of some number of cliques being zonked

        # loop through the number of cliques left over
        for j in range(self.max_cliques_remaining + 1):

            # for i in range(self.max_cliques_zeroed+1):

            # bounds on number of cliques containing edge e
            # (these won't actually be zeroed)
            # NOTE(review): `num_cliques` is unbound here - presumably it was
            # meant to be derived from `j`; confirm the intended relation.
            min_cliques_zeroed = max(0,
                                     num_cliques - self.max_cliques_remaining)
            max_cliques_zeroed = min(num_cliques, self.max_cliques_zeroed)
            # the probability of some number of cliques containing edge e
            h = hypergeom(
                # number of possible cliques
                self.max_cliques,
                # number of those present
                num_cliques,
                # number of cliques which could intersect edge e
                max_cliques_zeroed)
            # here, z is the number of cliques which _do_ intersect edge e
            A = [((z, num_cliques - z), h.pmf(z))
                 for z in range(min_cliques_zeroed, max_cliques_zeroed + 1)]
            # the bound is half the number of functions
            b = (comb(self.max_cliques, num_cliques, exact=True) - 1) / 2
            self.add_constraint(A, b)
Exemple #12
0
    def add_total_cliques_counting_bound_constraints(self):
        """Adds counting bound, based on total number of cliques.

        For each "level" (total number of cliques found, from 0 through
        ``self.max_cliques``) one constraint is passed to
        ``self.add_constraint``.  Its terms are keyed by
        ``(z, num_cliques - z)`` and weighted by the hypergeometric
        probability that exactly z cliques contain edge e.
        """
        for num_cliques in range(self.max_cliques + 1):
            # fewest / most cliques that can contain edge e
            # (these won't actually be zeroed)
            fewest_hit = max(0, num_cliques - self.max_cliques_remaining)
            most_hit = min(num_cliques, self.max_cliques_zeroed)

            # population: number of possible cliques; successes: number of
            # those present; draws: number of cliques which could intersect
            # edge e
            hit_dist = hypergeom(self.max_cliques, num_cliques, most_hit)

            # z is the number of cliques which _do_ intersect edge e
            terms = []
            for z in range(fewest_hit, most_hit + 1):
                terms.append(((z, num_cliques - z), hit_dist.pmf(z)))

            # the bound is half the number of functions
            n_functions = comb(self.max_cliques, num_cliques, exact=True)
            self.add_constraint(terms, (n_functions - 1) / 2)
    def test_discrete_induced_sampling(self):
        # Case 1: two different custom discrete variables, one with
        # geometrically spaced locations and uniform masses, the other with
        # integer locations and geometrically growing masses.
        count1 = 10
        locations1 = np.geomspace(1.0, 512.0, num=count1)
        weights1 = np.ones(count1, dtype=float) / count1
        var1 = float_rv_discrete(name='float_rv_discrete',
                                 values=(locations1, weights1))()
        count2 = 10
        locations2 = np.arange(0, count2)
        # if increase from 16 unmodififed becomes ill conditioned
        weights2 = np.geomspace(1.0, 16.0, num=count2)
        weights2 /= weights2.sum()
        var2 = float_rv_discrete(name='float_rv_discrete',
                                 values=(locations2, weights2))()
        self.help_discrete_induced_sampling(var1, var2, 30)

        # Case 2: the same hypergeometric variable in both dimensions.
        num_type1, num_type2, num_trials = [10, 10, 9]
        hypergeom_var = stats.hypergeom(
            num_type1 + num_type2, num_type1, num_trials)
        self.help_discrete_induced_sampling(hypergeom_var, hypergeom_var, 300)

        # Case 3: the same binomial variable in both dimensions.
        binom_var = stats.binom(10, 0.5)
        self.help_discrete_induced_sampling(binom_var, binom_var, 300)

        # Case 4: the same uniform discrete variable in both dimensions.
        N = 10
        xk = np.arange(N)
        pk = np.ones(N) / N
        uniform_var = float_rv_discrete(name='discrete_chebyshev',
                                        values=(xk, pk))()
        self.help_discrete_induced_sampling(uniform_var, uniform_var, 30)
Exemple #14
0
def _calc_score(
    fore_hit_size,
    fore_size,
    back_hit_size,
    back_size,
    prob_fn=None,
):
    """Score the enrichment of hits in a foreground set against a background.

    The number of foreground hits is modelled either as hypergeometric
    (default) or as binomial with success rate
    ``back_hit_size / back_size``.  The score is
    ``-log10(P(X >= k) / P(X <= k))`` with k = ``fore_hit_size``.

    Returns 0 when the background contains no hits, -200 when
    ``P(X <= k)`` is not positive and 200 when ``P(X >= k)`` is not positive.
    """
    model = 'hypergeom' if prob_fn is None else prob_fn
    assert model in ['hypergeom', 'binom']

    if back_hit_size <= 0:
        return 0

    background_rate = back_hit_size / back_size
    if model == 'hypergeom':
        dist = stats.hypergeom(back_size, back_hit_size, fore_size)
    else:
        dist = stats.binom(fore_size, background_rate)

    # sf(k - 1) is P(X >= k); cdf(k) is P(X <= k)
    upper_tail = dist.sf(fore_hit_size - 1)
    lower_tail = dist.cdf(fore_hit_size)

    if lower_tail <= 0:
        return -200
    if upper_tail <= 0:
        return 200
    return -np.log10(upper_tail / lower_tail)
Exemple #15
0
    def get_enrichment_score(self, query_id_set_n, M, overlap_n):
        """Return the hypergeometric tail probability of the observed overlap.

        Builds ``hypergeom(M, self.n, query_id_set_n)`` and evaluates its
        survival function at ``overlap_n``, i.e. P(X > overlap_n).  The
        inputs and the result are also printed as a single line.

        NOTE(review): an "at least this many" p-value would be
        ``sf(overlap_n - 1)``; confirm whether callers rely on the strict
        inequality before changing it.
        """
        pv = hypergeom(M, self.n, query_id_set_n).sf(overlap_n)
        # A single parenthesised argument prints the same text under both
        # Python 2 and Python 3.
        print("m=" + str(M) + " n=" + str(self.n) + " q=" + str(query_id_set_n) + " k=" + str(overlap_n) + " pv=" + str(pv))

        return pv # EnrichmentScore(pv, k, overlap, self.name)
Exemple #16
0
def run(domain_name='X', projection_name='Y8'):
    """Plot clusters in two projections and compare them with tissue categories.

    Loads 'prob2.mat', scatter-plots the first two coordinates of the
    `domain_name` and `projection_name` arrays coloured by cluster, saves the
    figure, prints how many same-cluster sample pairs are shared with the
    tissue categories and returns a frozen hypergeometric distribution built
    from those pair counts.

    NOTE(review): this block uses Python 2 print statements.  It also reads
    `domain_clusters` and `dom`, neither of which is assigned in this
    function (the only assignment of `domain_clusters` is commented out), so
    unless both exist as globals the function raises NameError.
    """
    prob2 = sio.loadmat('prob2.mat')

    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']

    domains = [prob2.get(d) for d in domain_names]
    #domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')

    # NOTE(review): `domain_clusters` is only defined in the commented-out
    # line above.
    clusters = domain_clusters[domain_names.index(domain_name)]
    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[domain_names.index(domain_name)]

    f = plt.figure(1)
    f.clear()
    random.seed(1)
    ct = array(mc.getct(218))

    #px, py = 2, 2
    sstrings = ['21{0:d}'.format(i + 1) for i in range(4)]

    # NOTE(review): `dom` is not defined in this function; `inds` is used
    # below to index the rows of `cdom` and `pdom`.
    inds = arange(shape(dom)[1])

    # Shift the cluster / tissue ids down by one before using them as indices.
    c_inds = array(clusters).flatten() - 1
    tc_inds = tissue_clusters.flatten() - 1

    colors = ct[c_inds, :]

    ax = f.add_subplot(sstrings[0], title = \
                         'Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=colors)
    ax = f.add_subplot(sstrings[1], title = \
                       'Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=colors)

    # Every unordered pair of positions (ix < iy) that share a cluster id,
    # encoded as the string "ixxiy".
    cpairs = set([
        '{0:d}x{1:d}'.format(ix, iy) for ix, x in enumerate(c_inds)
        for iy, y in enumerate(c_inds) if ix < iy and x == y
    ])
    # The same pairing for the tissue categories.
    tcpairs = set([
        '{0:d}x{1:d}'.format(ix, iy) for ix, x in enumerate(tc_inds)
        for iy, y in enumerate(tc_inds) if ix < iy and x == y
    ])
    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    # Number of unordered pairs among len(tc_inds) items.
    max_pairs = (len(tc_inds) * len(tc_inds) - len(tc_inds)) / 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    print 'using affinity propagation with affinites over domain {0}'.format(
        domain_name)
    print 'found'
    print ' max pairs: {0}'.format(max_pairs)
    print ' total pairs: {0}'.format(total_pairs)
    print ' tissue pairs: {0}'.format(len(tcpairs))
    print ' cluster pairs: {0}'.format(len(cpairs))
    print ' shared pairs: {0}'.format(shared_pairs)

    # NOTE(review): scipy's argument order is (population, successes,
    # draws); here the tissue pair count is passed as the population and
    # max_pairs as the number of draws - confirm this is intended.
    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
Exemple #17
0
    def test_get_univariate_leja_rule_bounded_discrete(self):
        # For each bounded discrete variable, the Leja quadrature rule at the
        # requested level must reproduce the moment of order
        # (number of points - 1) computed from the probability masses.
        growth_rule = partial(constant_increment_growth_rule, 2)
        level = 3

        nmasses = 20
        uniform_locations = np.arange(0, nmasses, dtype='float')
        uniform_masses = np.ones(nmasses) / nmasses
        var_cheb = float_rv_discrete(
            name='discrete_chebyshev',
            values=(uniform_locations, uniform_masses))()

        variables = [
            var_cheb,
            stats.binom(17, 0.5),
            stats.hypergeom(10 + 10, 10, 9),
        ]
        for variable in variables:
            quad_rule = get_univariate_leja_quadrature_rule(
                variable, growth_rule)

            x, w = quad_rule(level)
            # Apply the variable's scale and shift to the returned points.
            loc, scale = transform_scale_parameters(variable)
            x = x * scale + loc

            xk, pk = get_probability_masses(variable)
            print(x, xk, loc, scale)

            degree = x.shape[0] - 1
            true_moment = (xk**degree).dot(pk)
            moment = (x**degree).dot(w[-1])

            print(moment, true_moment, variable.dist.name)
            assert np.allclose(moment, true_moment)
Exemple #18
0
def hypergeom_p_values(data, selected, callback=None):
    """
    Calculate per-feature enrichment p-values with the hypergeometric distribution.

    Both inputs are binarised with ``> 0`` before counting, so only the
    presence or absence of a feature in an example matters.

    :param data: all examples in rows, their features in columns
    :type data: numpy.array
    :param selected: selected examples in rows, their features in columns
    :type selected: numpy.array
    :param callback: optional callable that receives a progress percentage
        every 250 features
    :return: list with one p-value per feature
    :raises ValueError: if the two inputs have a different number of columns
    """
    if data.shape[1] != selected.shape[1]:
        raise ValueError("Number of columns does not match.")

    # clip values to binary indicators
    in_population = data > 0
    in_sample = selected > 0

    feature_total = in_sample.shape[1]
    population_size = in_population.shape[0]   # number of all data examples
    sample_size = in_sample.shape[0]           # number of selected examples
    population_hits = np.sum(in_population, axis=0)  # occurrences in all data
    sample_hits = np.sum(in_sample, axis=0)          # occurrences in the selection

    report_every = 250
    p_vals = []
    for idx, (pop_hit, sam_hit) in enumerate(zip(population_hits, sample_hits)):
        feature_dist = stats.hypergeom(population_size, pop_hit, sample_size)
        # The p-value is the probability of a count equal to or more extreme
        # than the observed one: P(X >= k) = sf(k - 1).
        p_vals.append(feature_dist.sf(sam_hit - 1))
        if callback and idx % report_every == 0:
            callback(100*idx/feature_total)
    return p_vals
Exemple #19
0
def hyper(N,M,n,m):
    '''Return P(X >= m) for X ~ hypergeom(N, M, n), the p-value of a hypergeometric test.

    The probability mass is summed term by term from m up to min(n, M); an
    empty range yields 0.
    '''
    dist = hypergeom(N,M,n)
    upper = min(n+1, M+1)
    return sum(dist.pmf(successes) for successes in np.arange(m, upper))
Exemple #20
0
def pvalue(N, M, n, m):
    """Upper-tail hypergeometric p-value, applied element-wise to vector arguments.

    Scalar case: with ``hg = sps.hypergeom(N, M, n)`` the result is the sum
    of ``hg.pmf`` from m up to ``min(M, n)``; an m larger than M or n is
    first clamped to ``min(M, n)``.

    Vector case: when the longest argument has more than one element, every
    argument of length 1 is repeated to that length, the remaining ones must
    already have that length, and a list with one p-value per position is
    returned.

    Raises ValueError if the vector arguments differ in length.
    """
    N, M, n, m = deepcopy(N), deepcopy(M), deepcopy(n), deepcopy(m)
    maxlen = max([length(N), length(M), length(n), length(m)])

    if not maxlen > 1:
        hg = sps.hypergeom(N, M, n)
        if m > M or m > n:
            m = min(M, n)
        return sum(hg.pmf(np.arange(m, min(M + 1, n + 1))))

    def _expand(value):
        # Repeat a length-1 argument; anything else must already match.
        if length(value) == 1:
            return [value for i in range(maxlen)]
        elif length(value) != maxlen:
            raise ValueError('Inequally long vectors have been provided to this function')
        return value

    N = _expand(N)
    M = _expand(M)
    n = _expand(n)
    m = _expand(m)
    return [pvalue(N[i],M[i],n[i],m[i]) for i in range(maxlen)]
def compute_clusters_ps(predicted_clusters, goa_clusters):
    """Find, for every predicted cluster, the GOA cluster with the smallest enrichment p-value.

    Clusters with fewer than 3 members are dropped from both inputs.  The
    p-value of a pair is ``P(X >= overlap)`` under a hypergeometric
    distribution whose population is the summed size of the kept GOA
    clusters.

    Returns a dict mapping each kept predicted cluster to
    ``(best_goa_cluster, p_value)``; when no GOA cluster beats the initial
    value the entry is ``(None, inf)``.
    """
    min_size = 3
    predicted_clusters = {
        name: members
        for name, members in predicted_clusters.items()
        if len(members) >= min_size
    }
    goa_clusters = {
        name: members
        for name, members in goa_clusters.items()
        if len(members) >= min_size
    }
    n_total_proteins = sum(len(members) for members in goa_clusters.values())

    top_p_values = {}
    for predict_cluster, predict_proteins in tqdm(predicted_clusters.items()):
        n_predicted_proteins = len(predict_proteins)
        best_p = float('inf')
        best_goa = None
        for goa_cluster, goa_proteins in goa_clusters.items():
            shared = len(goa_proteins.intersection(predict_proteins))
            candidate_p = ss.hypergeom(
                n_total_proteins, len(goa_proteins),
                n_predicted_proteins).sf(shared - 1)
            # strict comparison: on ties the first cluster seen is kept
            if candidate_p < best_p:
                best_p = candidate_p
                best_goa = goa_cluster
        top_p_values[predict_cluster] = (best_goa, best_p)

    return top_p_values
Exemple #22
0
    def add_total_cliques_equality_constraints(self):
        """Adds constraints for a given total number of cliques.

        For 0 <= m <= N, these define a variable '(total_cliques, m)',
        which is E[ number of gates need to find m cliques ],
        or "the expected number of gates needed at 'level m'".
        Each constraint sets the hypergeometric-weighted sum of the
        ``(z, m - z)`` variables minus that variable equal to 0.
        """
        for num_cliques in range(self.max_cliques + 1):
            # fewest / most cliques that can contain edge e
            # (these won't actually be zeroed)
            fewest_hit = max(0, num_cliques - self.max_cliques_remaining)
            most_hit = min(num_cliques, self.max_cliques_zeroed)

            # population: number of possible cliques; successes: number of
            # those present; draws: number of cliques which could intersect
            # edge e
            hit_dist = hypergeom(self.max_cliques, num_cliques, most_hit)

            # z is the number of cliques which _do_ intersect edge e
            terms = []
            for z in range(fewest_hit, most_hit + 1):
                terms.append(((z, num_cliques - z), hit_dist.pmf(z)))

            # Constrain the total number of gates at this "level" to equal
            # the average weighted by the probability of each number of
            # cliques being zeroed out.
            level_term = (('total_cliques', num_cliques), -1.0)
            self.add_constraint(terms + [level_term], 0, True)
Exemple #23
0
def get_enriched(all_genes, selection, name, method, cutoff, print_all):
    """Get enrichment for pfam domains.

    Counts how often each item occurs in ``all_genes`` and in ``selection``,
    computes a hypergeometric P(X >= observed) per item, corrects the
    p-values with ``multipletests`` and returns a DataFrame sorted by the
    corrected value.  ``method == "bh"`` is translated to ``"fdr_bh"``.
    Unless ``print_all`` is true only rows with a corrected value
    ``<= cutoff`` are kept.
    """
    counts = pd.DataFrame({
        "all": pd.Series(Counter(all_genes)),
        name: pd.Series(Counter(selection)),
    })
    df = counts.fillna(0)

    # Hypergeometric test: population = all occurrences, draws = selected
    # occurrences, successes = occurrences of the row's item
    M = df["all"].sum()
    N = df[name].sum()

    def _row_p_value(row):
        return hypergeom(M, row["all"], N).sf(row[name] - 1)

    df["p_value"] = df.apply(_row_p_value, axis=1)

    # Multiple test correction
    corr = method if method != "bh" else "fdr_bh"
    df[corr] = multipletests(df["p_value"], method=corr)[1]
    df = df.sort_values(corr)
    df["significant"] = df[corr] <= cutoff

    # Add pfam domain and description columns
    df = df.reset_index().rename(columns={"index": "pfam_domain"})

    if not print_all:
        df = df.loc[df["significant"]]

    df.insert(1, "description", df["pfam_domain"].map(get_pfam_desc))
    return df
Exemple #24
0
def find_hypergeometric(genes, pred_no_training, population_size=10683):
    """Print the fold enrichment and hypergeometric p-value of an overlap.

    The overlap between ``genes`` and ``pred_no_training`` is compared with
    a hypergeometric distribution with ``population_size`` items in total,
    ``len(pred_no_training)`` marked items and ``len(genes)`` draws.

    The fold enrichment is the overlap size divided by the most probable
    overlap size (the mode of the distribution).  If several sizes tie for
    the maximum probability, one value per tied size is returned.

    :param genes: iterable of gene identifiers
    :param pred_no_training: iterable of predicted gene identifiers
    :param population_size: total number of genes in the background; the
        default keeps the previously hard-coded value
    :return: list of fold enrichment values
    """
    overlap = set(genes) & set(pred_no_training)
    M = population_size
    N = len(genes)
    n = len(pred_no_training)
    x = len(overlap)

    rv = hypergeom(M, n, N)
    # P(X >= x)
    pval = rv.sf(x - 1)

    prob = rv.pmf(np.arange(0, n + 1))
    # Index positions double as overlap sizes because the support starts at 0.
    modes = np.where(prob == np.max(prob))[0]

    # NOTE(review): a mode of 0 makes this a division by zero (numpy yields
    # inf or nan with a warning); confirm how callers want that reported.
    fold = (x / modes).tolist()
    print('Fold Enrichment', fold)
    print('hypergeometric p-value', pval)
    return fold
def hypergeometric_test(X, cluster, treshold):
    # type: (np.ndarray, np.ndarray, float) -> np.ndarray
    """Score every gene for over- or under-expression inside a cluster.

    ``X`` is binarised with ``X >= treshold``.  For each column (gene) the
    number of expressing cells inside ``cluster`` is compared with a
    hypergeometric distribution, giving one tail probability for "this many
    or more" and one for "this many or fewer".  The score is ``-log`` of the
    smaller tail, negated when the "or fewer" tail is the smaller one.

    Returns an array with one score per column of ``X``.
    """
    n_cells = X.shape[0]        # number of cells
    n_selected = len(cluster)   # number of cells belonging to cluster(s)
    scores = np.zeros((X.shape[1],))

    # Binary expression matrix
    expressed = (X >= treshold).astype(int)

    for gene_idx, gene in enumerate(expressed.T):
        n_expressing = gene.sum()  # number of cells expressing the gene
        dist = hypergeom(n_cells, n_expressing, n_selected)
        observed = gene[cluster].sum()

        # observed or more / observed or less
        p_over = dist.pmf(np.arange(observed, n_expressing + 1)).sum()
        p_under = dist.pmf(np.arange(0, observed + 1)).sum()

        # Proposed scoring: magnitude from the smaller tail, sign from
        # which tail that is.
        sign = -1 if p_under < p_over else 1
        scores[gene_idx] = -np.log(min(p_under, p_over)) * sign

    return scores
Exemple #26
0
    def test_get_univariate_leja_rule_bounded_discrete(self):
        # For each bounded discrete variable, the Leja quadrature rule at the
        # requested level must reproduce the moment of order
        # (number of points - 1) computed from the probability masses.
        from scipy import stats
        from pyapprox.variables import get_probability_masses

        growth_rule = partial(constant_increment_growth_rule, 2)
        level = 3

        nmasses = 20
        uniform_locations = np.arange(0, nmasses, dtype='float')
        uniform_masses = np.ones(nmasses) / nmasses
        var_cheb = float_rv_discrete(
            name='discrete_chebyshev',
            values=(uniform_locations, uniform_masses))()

        variables = [
            var_cheb,
            stats.binom(20, 0.5),
            stats.hypergeom(10 + 10, 10, 9),
        ]
        for variable in variables:
            quad_rule = get_univariate_leja_quadrature_rule(
                variable, growth_rule)

            # polys of binom, hypergeometric have no canonical domain [-1,1]
            x, w = quad_rule(level)

            xk, pk = get_probability_masses(variable)
            degree = x.shape[0] - 1
            true_moment = (xk**degree).dot(pk)
            moment = (x**degree).dot(w[-1])

            assert np.allclose(moment, true_moment)
Exemple #27
0
def family_hg(cluster_p_val_dict, mol_families, p_thresh=0.01):
    """Rank molecular families by enrichment in significant clusters.

    ``cluster_p_val_dict`` maps a cluster id to a dictionary with a
    ``'pval'`` field.  For every family, the clusters present in that
    dictionary are counted together with how many of them have
    ``pval <= p_thresh``.  The family's p-value is the hypergeometric
    probability of seeing at least that many significant clusters.

    Returns a list of ``(family, p_value, n_clusters, n_significant)``
    tuples sorted by ascending p-value.
    """
    from scipy.stats import hypergeom
    import numpy as np

    # per family: number of tested clusters and how many are significant
    per_family = []
    for family in mol_families:
        tested_ids = [c.cluster_id for c in family.clusters
                      if c.cluster_id in cluster_p_val_dict]
        n_hits = sum(1 for cluster_id in tested_ids
                     if cluster_p_val_dict[cluster_id]['pval'] <= p_thresh)
        per_family.append((family, len(tested_ids), n_hits))

    # population size and number of significant clusters overall
    n_total = len(cluster_p_val_dict)
    n_sig = sum(1 for entry in cluster_p_val_dict.values()
                if entry['pval'] <= p_thresh)

    ranked = []
    for family, n_clu, local_n_sig in per_family:
        # P(X >= local_n_sig) when drawing n_clu clusters
        upper_tail = np.arange(local_n_sig, n_clu + 1)
        p_value = hypergeom(n_total, n_sig, n_clu).pmf(upper_tail).sum()
        ranked.append((family, p_value, n_clu, local_n_sig))
    ranked.sort(key=lambda entry: entry[1])
    return ranked
Exemple #28
0
def generate_scores(_ids, _scores, _spectra, _kernel, _params):
    """Turn ion-match counts into hypergeometric scores for every (spectrum, kernel) pair.

    For each spectrum key ``j`` with a non-empty entry in ``_ids`` and each
    kernel index ``i`` listed there, the score is
    ``-100 * log10(pmf(_scores[j]))`` of a hypergeometric distribution with

    * population: ``int(kern['pm'] / 1000) - 200``, capped at 1500
    * successes: ``2 * (len(seq) - 1)``, capped at 20 (40 when the fragment
      mass tolerance is above 100) and raised to ``_scores[j] + 1`` when it
      would otherwise be smaller than the score
    * draws: one third of the number of entries in ``_spectra[j]['sms']``,
      raised to ``_scores[j] + 2`` when the score is at least that large

    :return: dict mapping ``(j, i)`` to the score
    """
    res = _params['fragment mass tolerance']
    sfactor = 40 if res > 100 else 20
    sadjust = 1
    sd = {}
    for j in _ids:
        if not _ids[j]:
            continue
        # These values depend on the spectrum only, so compute them once
        # per spectrum instead of once per kernel.
        matched = _scores[j]
        sc = len(_spectra[j]['sms']) / 3
        if matched >= sc:
            sc = matched + 2
        for i in _ids[j]:
            kern = _kernel[i]
            pmass = int(kern['pm'] / 1000)
            cells = min(int(pmass - 200), 1500)
            total_ions = min(2 * (len(list(kern['seq'])) - 1), sfactor)
            if total_ions < matched:
                total_ions = matched + 1
            p = hypergeom(cells, total_ions, sc).pmf(matched)
            sd[(j, i)] = -100.0 * math.log10(p) * sadjust
    return sd
Exemple #29
0
def _calc_score(
    fore_hit_size, fore_size, back_hit_size, back_size,
    prob_fn=None,
):
    """Return a signed log-ratio score comparing upper and lower tails.

    The foreground hit count is evaluated against either a hypergeometric
    distribution (``stats.hypergeom(back_size, back_hit_size, fore_size)``)
    or a binomial one (``stats.binom(fore_size, back_hit_size / back_size)``),
    selected by ``prob_fn`` (``"hypergeom"`` by default, or ``"binom"``).

    Returns ``0`` when there are no background hits, ``-200`` when the lower
    tail ``P(X <= k)`` is not positive, ``200`` when the upper tail
    ``P(X >= k)`` is not positive, and otherwise
    ``-log10(P(X >= k) / P(X <= k))``.
    """
    prob_fn = "hypergeom" if prob_fn is None else prob_fn

    assert prob_fn in ["hypergeom", "binom"]

    if back_hit_size <= 0:
        return 0

    # Computed for both models, exactly as the hit rate of the background.
    background_rate = back_hit_size / back_size

    if prob_fn == "hypergeom":
        dist = stats.hypergeom(back_size, back_hit_size, fore_size)
    else:
        dist = stats.binom(fore_size, background_rate)

    upper_tail = dist.sf(fore_hit_size - 1)  # P(X >= k)
    lower_tail = dist.cdf(fore_hit_size)     # P(X <= k)

    if lower_tail <= 0:
        return -200
    if upper_tail <= 0:
        return 200
    return -np.log10(upper_tail / lower_tail)
Exemple #30
0
 def chug_count_distribution(cls, player_count):
     """Return the exact pmf and a label for the chug-count distribution.

     The distribution is hypergeometric with a population of
     ``13 * player_count``, ``player_count`` success states and 13 draws.
     Returns a ``(pmf_callable, description_string)`` pair.
     """
     draws = 13
     population = draws * player_count
     successes = player_count
     frozen = hypergeom(population, successes, draws)
     label = f"HyperGeometric({population}, {successes}, {draws})"
     return frozen.pmf, label
Exemple #31
0
    def __init__(self,
                 M,
                 n,
                 N,
                 interval_shift=0,
                 order=2,
                 type='aleatory',
                 name='',
                 number=0):
        """Build a hypergeometric variable and its orthogonal basis.

        Parameters
        ----------
        M, n : int
            Non-negative counts. The underlying scipy distribution is created
            with population ``M + n`` and ``n`` success states, so ``M`` is
            presumably the number of non-success items (confirm with callers).
        N : int
            Number of draws; must satisfy ``1 <= N <= M + n``.
        interval_shift : int
            Passed as ``loc`` to the scipy distribution.
        order : int
            Order passed to ``recursive_var_basis``.
        type : str
            Uncertainty type name resolved through ``UncertaintyType.from_name``.
        name : str
            Display name; defaults to ``x<number>`` when empty.
        number : int
            Index used to build the symbolic variable name ``x<number>``.

        Raises
        ------
        VariableInputError
            If ``M`` or ``n`` is negative, or ``N`` is outside ``[1, M + n]``.
        """
        if M < 0:
            raise VariableInputError(
                'HypergeometricVariable M must be greater or equal to 0.')

        if n < 0:
            raise VariableInputError(
                'HypergeometricVariable n must be greater or equal to 0.')

        if (N < 1) or (N > (M + n)):
            # The check is on N (the number of draws); the message used to
            # name M and misstate the bounds.
            raise VariableInputError(
                'HypergeometricVariable N must be at least 1 and at most M+n.'
            )

        self.interval_shift = interval_shift
        self.order = order
        self.M = M
        self.n = n
        self.N = N
        self.type = UncertaintyType.from_name(type)
        self.name = f'x{number}' if name == '' else name
        self.var_str = f'x{number}'
        self.x = symbols(self.var_str)

        self.distribution = Distribution.HYPERGEOMETRIC

        # scipy's hypergeom takes (population, success states, draws).
        self.dist = hypergeom(M=self.M + self.n,
                              n=self.n,
                              N=self.N,
                              loc=self.interval_shift)

        self.find_high_lim()
        self.get_probability_density_func()
        self.check_num_string()

        self.recursive_var_basis(self.x_values, self.probabilities, self.order)
        self.create_norm_sq(self.x_values, self.probabilities)

        # The approximation bounds are the extremes of the discrete support
        # that was actually evaluated.
        self.low_approx = np.min(self.x_values)
        self.high_approx = np.max(self.x_values)
        self.std_bounds = (self.low_approx, self.high_approx)

        self.check_bounds()

        if self.type == UncertaintyType.EPISTEMIC:
            warn('The HypergeometricVariable is usually not epistemic. For an '
                 'epistemic variable, consider using the continuous uniform '
                 'distribution with type epistemic.')

        # NOTE(review): this only binds a local name and has no effect on the
        # warnings module; possibly `warnings.showwarning = _warn` was meant.
        showwarning = _warn
Exemple #32
0
def calc_pvalue(gene_list, gene_set, M):
    """Hypergeometric over-representation p-value for a gene-set overlap.

    Parameters
    ----------
    gene_list : iterable
        Query genes (duplicates are collapsed).
    gene_set : iterable
        Reference gene set (duplicates are collapsed).
    M : int
        Size of the background population.

    Returns
    -------
    (float, list)
        ``P(X >= k)`` where ``k`` is the overlap size, and the overlapping
        genes as a list (unordered).
    """
    gene_list = set(gene_list)
    gene_set = set(gene_set)
    overlap = gene_list & gene_set
    k = len(overlap)
    # sf(k) is P(X > k) and would drop the observed overlap itself; the
    # enrichment p-value is P(X >= k) = sf(k - 1).
    p_val = hypergeom(M, len(gene_set), len(gene_list)).sf(k - 1)
    return p_val, list(overlap)
 def test_hypergeometric(self):
     """Compare ``hypergeometric(N, K)`` with scipy's pmf for every sample size."""
     N, K = 20, 3
     p = hypergeometric(N, K)
     support = range(0, K + 1)
     # One row per number of draws 0..N, one column per success count 0..K.
     reference_rows = []
     for draws in range(N + 1):
         reference_rows.append(hypergeom(N, K, draws).pmf(support))
     scipy_p = np.array(reference_rows)
     worst = np.abs(p - scipy_p).max()
     self.assertTrue(worst < 1e-10)
Exemple #34
0
def calc_enrichment_score(n, o, M, N):
    """Return the enrichment p-value ``P(X >= o)``.

    n : number of screened strains with the attribute
    o : number of active strains with the attribute
    M : number of strains screened
    N : number of active strains
    """
    # sf(o - 1) is the probability of observing o or more.
    return hypergeom(M, n, N).sf(o - 1)
Exemple #35
0
 def variance_function(theta):
     """Sum ``pmf(x) * given_x_function(x, theta)**2`` over the support.

     The distribution is ``hypergeom(M, P, round(theta * M))``; ``M``, ``P``,
     ``N`` and ``given_x_function`` come from the enclosing scope.
     """
     rounded_m_theta = round(theta * M)
     TP_rv = hypergeom(M=M, n=P, N=rounded_m_theta)
     # Support bounds: max(0, draws - N) up to min(P, draws), inclusive.
     first_x = int(max(0, rounded_m_theta - N))
     stop_x = int(min((P + 1, rounded_m_theta + 1)))
     terms = []
     for x in range(first_x, stop_x):
         terms.append(TP_rv.pmf(x) * (given_x_function(x, theta)**2))
     return sum(terms)
 def calc_pvalue(query_id_set, reference_id_set, M):
     """Hypergeometric over-representation p-value for an id-set overlap.

     ``query_id_set`` and ``reference_id_set`` are converted to sets; ``M`` is
     the background population size. Returns ``(P(X >= k), overlap_list)``
     where ``k`` is the number of shared ids.
     """
     query_id_set = set(query_id_set)
     reference_id_set = set(reference_id_set)
     overlap = query_id_set & reference_id_set
     k = len(overlap)
     # sf(k) is P(X > k) and would drop the observed overlap itself; the
     # enrichment p-value is P(X >= k) = sf(k - 1).
     p_val = hypergeom(M, len(reference_id_set), len(query_id_set)).sf(k - 1)
     return p_val, list(overlap)
Exemple #37
0
 def hypergeometric_cdf(self, N, K, n, k):
     """Return the upper-tail probability ``P(X > k)``.

     Despite the name, this is the complement of the cdf.

     N = total number of genes in population
     K = number of GOA
     n = size of the selected sample (top 50, bottom half, etc.)
     k = number of successes in the sample
     """
     # sf() evaluates the tail directly; 1 - cdf(k) cancels to 0 (or noise)
     # once the tail drops below double precision.
     return hypergeom(N, K, n).sf(k)
Exemple #38
0
    def get_probability_density_func(self):
        """
        Evaluate the hypergeometric pmf at ``self.x_values`` and store the
        result in ``self.probabilities``.
        """
        population = self.M + self.n
        frozen = hypergeom(M=population, n=self.n, N=self.N)
        self.probabilities = frozen.pmf(self.x_values)
Exemple #39
0
 def hypergeometric_cdf(self, N, K, n, k):
   """Return the upper-tail probability ``P(X > k)``.

   Despite the name, this is the complement of the cdf.

   N = total number of genes in population
   K = number of GOA
   n = size of the selected sample (top 50, bottom half, etc.)
   k = number of successes in the sample
   """
   # sf() evaluates the tail directly; 1 - cdf(k) cancels to 0 (or noise)
   # once the tail drops below double precision.
   return hypergeom(N, K, n).sf(k)
Exemple #40
0
def test_prob(num_hits, pop_size, num_draws, num_matching=1):
    """Perform a hypergeometric test to see the probability of drawing the
    same entity num_hits many times. Need to check math

    """
    dist = hypergeom(pop_size, num_matching, num_draws)
    pval = dist.sf(num_hits - 1)
    return pval
Exemple #41
0
def calc_pvalue(gene_list, gene_set, M):
    """Hypergeometric over-representation p-value for a gene-set overlap.

    ``gene_list`` (query) and ``gene_set`` (reference) are converted to sets;
    ``M`` is the background population size. Returns
    ``(P(X >= k), overlap_list)`` where ``k`` is the number of shared genes.
    """
    gene_list = set(gene_list)
    gene_set = set(gene_set)
    overlap = gene_list & gene_set
    k = len(overlap)
    # sf(k) is P(X > k) and would drop the observed overlap itself; the
    # enrichment p-value is P(X >= k) = sf(k - 1).
    p_val = hypergeom(M, len(gene_set), len(gene_list)).sf(k - 1)
    return p_val, list(overlap)
Exemple #42
0
def run( domain_name = 'X', projection_name = 'Y8'  ):
  """Plot clusters of one domain in two projections and compare to tissues.

  Loads ``prob2.mat``, scatters the clusters of ``domain_name`` in its own
  first two coordinates and in those of ``projection_name``, saves the figure,
  prints pair statistics, and returns a frozen hypergeometric distribution
  over the number of co-clustered sample pairs shared with the tissue
  categories.
  """
  prob2 = sio.loadmat('prob2.mat')

  domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']

  domains = [prob2.get(d) for d in domain_names]
  # This lookup was commented out although `domain_clusters` is used below.
  # NOTE(review): assumes the .mat file stores cluster ids under 'ids_<name>'.
  domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
  tissue_clusters = prob2.get('tissue_category')

  clusters = domain_clusters[domain_names.index(domain_name)]
  pdom = domains[domain_names.index(projection_name)]
  cdom = domains[domain_names.index(domain_name)]

  f = plt.figure(1)
  f.clear()
  random.seed(1)
  ct = array(mc.getct(218))

  sstrings = ['21{0:d}'.format(i+1) for i in range(4)]

  # Cluster ids are 1-based in the file; shift to 0-based indices.
  c_inds = array(clusters).flatten() -1
  tc_inds = tissue_clusters.flatten() -1

  # The original referenced an undefined `dom` here. One row index per
  # clustered sample keeps the scatter points aligned with `colors`.
  inds = arange(len(c_inds))

  colors = ct[c_inds,:]

  ax = f.add_subplot(sstrings[0], title = \
                       'Clusters from genespace affinity. Projection to first two elements')
  ax.scatter(*cdom[inds,0:2].T,s= 100, c = colors)
  ax = f.add_subplot(sstrings[1], title = \
                     'Clusters from genespace affinity. Projection to MVE')
  ax.scatter(*pdom[inds,0:2].T,s= 100, c = colors)

  # Unordered sample pairs that share a cluster / a tissue category.
  cpairs = set(['{0:d}x{1:d}'.format(ix,iy)
                for ix, x in enumerate(c_inds) for iy, y in enumerate(c_inds)
                if ix < iy and x == y ])
  tcpairs = set(['{0:d}x{1:d}'.format(ix,iy)
                for ix, x in enumerate(tc_inds) for iy, y in enumerate(tc_inds)
                if ix < iy and x == y ])
  f.savefig('figs/cluster_projectsions.tiff',format = 'tiff')

  # n * (n - 1) is always even, so floor division is exact and stays an int.
  max_pairs =( len(tc_inds) * len(tc_inds)  - len(tc_inds)) // 2
  total_pairs = len(cpairs.union(tcpairs))
  shared_pairs =len(cpairs.intersection(tcpairs))

  # Single-argument print calls behave identically on Python 2 and 3.
  print('using affinity propagation with affinites over domain {0}'.format(domain_name))
  print('found')
  print(' max pairs: {0}'.format(max_pairs))
  print(' total pairs: {0}'.format(total_pairs))
  print(' tissue pairs: {0}'.format(len(tcpairs)))
  print(' cluster pairs: {0}'.format(len(cpairs)))
  print(' shared pairs: {0}'.format(shared_pairs))

  # scipy's hypergeom takes (population, success states, draws): all possible
  # pairs form the population, which the original passed last.
  hg =  hypergeom( max_pairs, len(tcpairs), len(cpairs) )
  return hg
Exemple #43
0
def p_value(num_genes,
            num_genes_int_top_list,
            num_top_genes,
            total_genes=4000):
    """Return ``P(X >= num_genes_int_top_list)`` under a hypergeometric model.

    The model draws ``num_genes`` genes from ``total_genes`` of which
    ``num_top_genes`` are "top" genes.
    """
    dist = hypergeom(total_genes, num_top_genes, num_genes)
    tail = dist.sf(num_genes_int_top_list - 1)
    if not isnan(tail):
        return tail
    # Old versions of hypergeom.sf() return NaN; sum the pmf explicitly.
    support = range(num_genes_int_top_list, num_genes + 1)
    return dist.pmf(support).sum()
def hyper_test2(X, K, n, N):
    """
    Hypergeometric test for underexpression. Gives the probability that there
    are X or fewer events A over n occurences, given the total number of
    events K over the total number of occurences N.

    Note that this is not 1 - hyper_test(...): both tails include the
    probability of observing exactly X.
    """
    # One frozen distribution and one cdf call replace the per-term rebuild
    # and pmf sum (this resolves the former "improve with cdf" TODO).
    return hypergeom(N, n, K).cdf(X)
Exemple #45
0
 def test_rvs(self):
     vals = stats.hypergeom.rvs(20, 10, 3, size=(2, 50))
     assert numpy.all(vals >= 0) & numpy.all(vals <= 3)
     assert numpy.shape(vals) == (2, 50)
     assert vals.dtype.char in typecodes["AllInteger"]
     val = stats.hypergeom.rvs(20, 3, 10)
     assert isinstance(val, int)
     val = stats.hypergeom(20, 3, 10).rvs(3)
     assert isinstance(val, numpy.ndarray)
     assert val.dtype.char in typecodes["AllInteger"]
def hyper_test(X, K, n, N):
    """
    Hypergeometric test for overexpression. Gives the probability that there
    are X or more events A over n occurences, given the total number of
    events K over the total number of occurences N.

    X: Number of events
    K: Total number of events
    n: number of occurences
    N: total number of occurences
    """
    # P(x >= X) is the survival function at X - 1. One frozen distribution
    # replaces the per-term rebuild, and sf avoids the cancellation of
    # 1 - sum(pmf) for small tails (resolves the "improve with cdf" TODO).
    return hypergeom(N, n, K).sf(X - 1)
Exemple #47
0
    def baseline(self):
        """Return the chance-level performance vector.

        The baseline models OOT posts landing in the top N list purely by
        chance, i.e. a hypergeometric random variable built from ``self.M``,
        ``self.n`` and ``self.N``. The returned array holds its probability
        mass for every count from ``self.min_sup`` to ``self.max_sup``
        inclusive.
        """
        support = np.arange(self.min_sup, self.max_sup+1)
        return hypergeom(self.M, self.n, self.N).pmf(support)
Exemple #48
0
    def combo_in_top(n):
        """Print the chance of holding both combo cards within ``n`` cards.

        Uses hypergeometric draws of ``n`` cards from ``N`` for card A
        (``nA`` copies), card B (``nB`` copies) and lands (``nLands``
        copies), all taken from the enclosing scope. Nothing is returned.
        """
        dist_A = hypergeom(N, nA, n)
        dist_B = hypergeom(N, nB, n)
        dist_L = hypergeom(N, nLands, n)

        # cdf(k) is the probability of k or fewer successes.
        lands_none = dist_L.cdf(0)
        lands_le1 = dist_L.cdf(1)
        lands_le4 = dist_L.cdf(4)
        # A keepable hand has between 2 and 4 lands.
        p_keepable = lands_le4 - lands_le1

        # Probability of drawing none / at least one of each combo card.
        miss_A = dist_A.cdf(0)
        miss_B = dist_B.cdf(0)
        hit_A = 1 - miss_A
        hit_B = 1 - miss_B
        # http://math.stackexchange.com/questions/72589/calculating-probability-of-at-least-one-event-occurring

        def p_not_any_fail(p_A_fail, p_B_fail):
            return (1 - p_A_fail) + (1 - p_B_fail) - (1 - p_A_fail * p_B_fail)

        # Two equivalent earlier formulations, kept from the original; both
        # values are superseded below.
        p_and = (1 - miss_A) + (1 - miss_B) - (1 - miss_A * miss_B)
        p_and = miss_A * miss_B - miss_A - miss_B + 1

        p_neither = miss_A * miss_B    # neither combo card
        p_either = 1 - p_neither       # at least one of the two cards
        p_and = hit_A + hit_B - p_either   # both cards
        p_exactly_one = p_either - p_and   # A or B but not both

        print("p_and = %r" % (p_and,))

        p_not_any_fail(1 - p_and, 1 - p_keepable)
Exemple #49
0
def hg_p_value(n_parent,k_parent,n_child,k_child):
    """
    One-tailed hypergeometric test of H0: the partition is random.

    Returns the lower-tail probability P(X <= k_child) when k_child does not
    exceed the expected count, and the upper-tail probability
    P(X >= k_child) otherwise; the result is clamped at 0.
    """
    dist = spst.hypergeom(n_parent,k_parent,n_child)
    expected_hits = n_child*(k_parent*1.0/n_parent)

    if not k_child <= expected_hits:
        # Above the mean: probability of a count at least this large.
        return max(1-dist.cdf(k_child-1),0.0)
    # At or below the mean: probability of a count at least this small.
    return max(0.0,dist.cdf(k_child))
Exemple #50
0
def get_sender_pvals(addrCounts):
    """Compute lower- and upper-tail hypergeometric p-values per address.

    ``addrCounts`` maps an address to a pair whose first element is the hit
    count and whose second element is the draw count. Returns two lists of
    ``(p, hits, draws, address)`` tuples, each sorted ascending: the first
    uses P(X <= hits), the second P(X >= hits).
    """
    pairs = list(addrCounts.values())
    M = sum(pair[0] for pair in pairs)
    N = sum(pair[1] for pair in pairs)
    low, high = [], []
    for addr, pair in addrCounts.items():
        hits, draws = pair[0], pair[1]
        dist = stats.hypergeom(N, M, draws)
        low.append((dist.cdf(hits), hits, draws, addr))
        upper = dist.sf(hits - 1)
        if isnan(upper):
            # Old versions of hypergeom.sf() return NaN; sum the pmf instead.
            upper = dist.pmf(range(hits, draws + 1)).sum()
        high.append((upper, hits, draws, addr))
    low.sort()
    high.sort()
    return low, high
def calculate_enrichment(pathway_matrix, gene_set):
    """Calculate hypergeometric enrichment of the set for each pathway.

    The pathway matrix should have pathways in rows and genes in columns.
    ``gene_set`` must be a set (its ``intersection`` method is used).

    Returns a Series indexed like ``pathway_matrix`` holding, per pathway,
    the mid-p value ``sf(hits) + 0.5 * pmf(hits)``.
    """
    # Only consider genes which are known to be in pathways.
    known_genes = gene_set.intersection(pathway_matrix.columns)
    # pandas 2 rejects a set as a column indexer, so use a list, kept in
    # column order for determinism.
    pathway_gene_list = [gene for gene in pathway_matrix.columns
                         if gene in known_genes]
    n_genes = len(pathway_matrix.columns)
    n_drawn = len(pathway_gene_list)
    # Each pathway needs its own distribution because they differ in length.
    distributions = [hypergeom(n_genes, pathway_len, n_drawn)
                     for pathway_len in pathway_matrix.sum(axis=1)]
    pathway_hits = pathway_matrix[pathway_gene_list].sum(axis=1)
    # Each p-value for the hypergeometric enrichment is
    # survival function + 0.5 * pmf.
    significance = [dist.sf(x) + 0.5 * dist.pmf(x)
                    for x, dist in zip(pathway_hits, distributions)]
    return Series(significance, index=pathway_matrix.index)
Exemple #52
0
def Hypergeometric(N, n, K, tag=None):
    """
    Build a Hypergeometric random variate.

    Parameters
    ----------
    N : int
        Total population size (positive integer).
    n : int
        Number of individuals of interest in the population
        (integer with ``0 < n <= N``).
    K : int
        Number of individuals chosen from the population
        (integer with ``0 < K <= N``).
    tag : optional
        Passed through unchanged to ``uv``.

    Example
    -------
    (Taken from the wikipedia page) An urn holds 45 black and 5 white
    marbles; 10 are drawn without replacement. The probability that exactly
    4 of the 10 are white is the pmf of the underlying scipy.stats object
    at 4::

        >>> black = 45
        >>> white = 5
        >>> draw = 10
        >>> h = Hypergeometric(black + white, white, draw)
        >>> h.rv.pmf(4)
        0.0039645830580151975

    """
    population_msg = 'Hypergeometric total population size "N" must be an integer greater than zero.'
    interest_msg = 'Hypergeometric interest population size "n" must be an integer greater than zero and no more than the total population size.'
    chosen_msg = 'Hypergeometric chosen population size "K" must be an integer greater than zero and no more than the total population size.'

    assert int(N) == N and N > 0, population_msg
    assert int(n) == n and 0 < n <= N, interest_msg
    assert int(K) == K and 0 < K <= N, chosen_msg

    frozen = ss.hypergeom(N, n, K)
    return uv(frozen, tag=tag)
def enrichment(dbfx,inp):
	"""Write a hypergeometric enrichment table to ``Enrichment_results.csv``.

	``inp`` is read through ``getinput`` and ``dbfx`` through ``database``
	(both defined elsewhere in this file). For every pathway sharing at least
	one gene with the input, one tab-separated row is written with the
	pathway name, pathway size, input size, overlap size and probability.
	"""
	print("*************************************************************************************************")
	print("Database :"+str(dbfx))
	print("Input file:"+str(inp))
	# The context manager guarantees the output file is flushed and closed,
	# even if reading the inputs fails.
	with codecs.open("Enrichment_results.csv",'w',encoding = "utf8") as fout:
		fout.write("PATHWAY_NAME\tLENGTH OF PATHWAY\tINPUT_GENESET\tOVERLAPPED_GENESET\tPValue\n")
		# Named `query_genes` rather than `input` to avoid shadowing the builtin.
		query_genes = set(getinput(inp))
		db,glst = database(dbfx)
		M = len(glst)
		for d in db.keys():
			pathway_genes = set(db[d])
			overlap = len(query_genes.intersection(pathway_genes))
			if overlap > 0:
				ora = hypergeom(M,len(pathway_genes),len(query_genes))
				# NOTE(review): pmf is the probability of exactly this overlap,
				# not a tail probability; confirm whether sf(overlap - 1) was
				# intended for the "PValue" column.
				p = ora.pmf(overlap)
				fout.write(str(d)+"\t"+str(len(pathway_genes))+"\t"+str(len(query_genes))+"\t"+str(overlap)+"\t"+str(p)+"\n")
Exemple #54
0
def partition_htest_value(n_parent,k_parent,n_child,k_child,alpha,cache=False):
    """
    Tests a partition of k_parent +1s in n_parent (+1/-1)s.

    Returns True when the observed k_child +1s among n_child drawn elements
    falls outside the (1-alpha) two-sided acceptance region for
    H0 = partition is random, and False otherwise.

    With ``cache`` set, the region endpoints come from ``partition_htest``
    and are memoised in ``_h_test_dict`` (both defined elsewhere in this
    file).
    """
    if n_child == 1:
        # A single drawn element is +1 with probability k_parent/n_parent.
        # The outcome is what was observed in the child, so the comparisons
        # are on k_child; they used to test k_parent, which made the second
        # arm unreachable and disagreed with the general formula below.
        p_plus = 1.0*k_parent/n_parent
        if p_plus < alpha/2.0 and k_child == 1:
            return True
        elif p_plus > 1-alpha/2.0 and k_child == 0:
            return True
        else:
            return False

    if cache==False:
        hg = spst.hypergeom(n_parent,k_parent,n_child)
        c = hg.cdf([k_child-1,k_child])
        # c[1] is the cdf INCLUDING k_child, c[0] the cdf for one less.
        # The value is inside the acceptance region only when
        # P(X <= k) > alpha/2 and P(X <= k-1) < 1 - alpha/2.
        return not ((c[1] > alpha/2.0) and (c[0] < (1.0-alpha/2.0)))
    else:
        hgt = (n_parent,k_parent,n_child,alpha)
        print(hgt)
        if hgt in _h_test_dict:
            left,right = _h_test_dict[hgt]
            print("saved one")
        else:
            left,right = partition_htest(*hgt)
            _h_test_dict[hgt] = (left,right)
        # NOTE(review): -1 appears to mark a missing endpoint; when only
        # `right` is -1 and k_child >= left, the last test is still true
        # because k_child > -1. Confirm against partition_htest.
        if left == -1 and right == -1:
            return False
        elif left == -1 and k_child > right:
            return True
        elif k_child < left and right == -1:
            return True
        elif k_child < left or k_child > right:
            return True
        else:
            return False
Exemple #55
0
    def calculateByDraw(self):
        """Fill the probability table and bar chart for one pool of outs.

        Reads the deck size, number of outs and number of draws from the spin
        boxes, models the number of outs drawn as hypergeometric, and shows
        for each possible count the probability of drawing exactly that many
        and (for counts above zero) at least that many.
        """
        deck_size = self.remainingDeckSizeSpinBox.value()
        n_outs = self.numberOfOutsInDeckSpinBox.value()
        n_draws = self.numberOfDrawsSpinBox.value()

        # hypergeometric formula: assumes draws are just total # of draws
        rv = hypergeom(deck_size, n_outs, n_draws)
        outs = np.arange(0, n_outs + 1)
        draws = np.arange(0, n_draws + 1)
        PMF = rv.pmf(outs)
        self.probabilityTable.clear()

        # Zero-initialised: only the first len(maxSuccess) entries are filled
        # below, and the whole arrays are printed and plotted afterwards
        # (np.empty left the remainder as uninitialised memory).
        odds = np.zeros(n_outs + 1, dtype=float)
        surviveodds = np.zeros(n_outs + 1, dtype=float)
        self.probabilityTable.setRowCount(len(outs) + 1)
        self.probabilityTable.setColumnCount(4)

        # Give every cell an item so it can be addressed with item(row, col).
        for col in range(self.probabilityTable.columnCount()):
            for row in range(self.probabilityTable.rowCount()):
                self.probabilityTable.setItem(row, col, QtGui.QTableWidgetItem())

        # Header row.
        item = self.probabilityTable.item(0, 0)
        item.setText(_translate("MainWindow", "Exactly", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(0, 1)
        item.setText(_translate("MainWindow", "Probability", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(0, 2)
        item.setText(_translate("MainWindow", "At Least", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)
        item = self.probabilityTable.item(0, 3)
        item.setText(_translate("MainWindow", "Probability", None))
        item.setTextAlignment(QtCore.Qt.AlignCenter | QtCore.Qt.AlignVCenter)

        # Loop to build the table and calculate the PMF and SF based on
        # inputs. The number of successes is bounded by whichever of draws
        # and outs is smaller.
        if len(outs) > len(draws):
            maxSuccess = draws
        else:
            maxSuccess = outs
        for out in maxSuccess:
            # creating spaces in table
            matrixElement = QtGui.QTableWidgetItem()
            self.probabilityTable.setItem(out + 1, 0, matrixElement)
            matrixElement = QtGui.QTableWidgetItem()
            self.probabilityTable.setItem(out + 1, 1, matrixElement)

            # P(X == out) and P(X > out)
            odds[out] = rv.pmf(out)
            surviveodds[out] = rv.sf(out)

            # populate the table
            # exact outs
            item = self.probabilityTable.item(out + 1, 0)
            item.setText(_translate("MainWindow", "{0:d}".format(out), None))
            item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
            item = self.probabilityTable.item(out + 1, 1)
            item.setText(_translate("MainWindow", "{0:3f}".format(odds[out]), None))
            item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)

            # Atleast outs
            if out != 0:  # atleast zero outs is meaningless
                item = self.probabilityTable.item(out + 1, 2)
                item.setText(_translate("MainWindow", "{0:d}".format(out), None))
                item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
                item = self.probabilityTable.item(out + 1, 3)
                # P(X >= out) == P(X > out - 1)
                item.setText(_translate("MainWindow", "{0:3f}".format(surviveodds[out-1]), None))
                item.setTextAlignment(QtCore.Qt.AlignRight | QtCore.Qt.AlignVCenter)
        # create the pmf graph)
        print(surviveodds)

        print(outs)
        # The message says "at least", so report P(X >= out); the original
        # printed the exact-count probability odds[out] here.
        print("The odds of getting at least {0:d} outs is {1:3f}".format(out, rv.sf(out - 1)))

        #### Bar graphs if I want to add them later

        ax = self.figure.add_subplot(111)
        width = 0.3
        ax.bar(outs, PMF, width, color = 'r' )
        # NOTE(review): Axes.hold was removed in matplotlib 3.0; these calls
        # only work with the older matplotlib this GUI was written against.
        ax.hold(True)
        ax.bar(outs+(1.2*width+1), surviveodds, width, color = 'b')
        ax.set_xticks(outs + width)
        ax.set_xticklabels(outs)
        ax.hold(False)
        self.canvas.draw()
Exemple #56
0
import matplotlib.pyplot as plt
from scipy.stats import hypergeom, rv_discrete
import numpy as np
# Scratch script exploring scipy.stats.hypergeom; only the part before
# exit() runs.
numargs = hypergeom.numargs
#[ M, n, N ] = [100, 10, -1]

#Display frozen pmf:

# NOTE(review): n=20 exceeds the population M=10 here, which is outside the
# distribution's valid parameter range; presumably a deliberate experiment.
rv = hypergeom( 10, 20, 3 )
print(rv.dist.b)
# Built-in min: np.min(a, 3) would interpret 3 as the `axis` argument.
x = np.arange( 0, min( rv.dist.b, 3 ) + 1 )
h = plt.plot( x, rv.pmf( x ) )
exit()
# Everything below is unreachable, and M, n, N are undefined because their
# assignment above is commented out.
#Check accuracy of cdf and ppf:

prb = hypergeom.cdf( x, M, n, N )
h = plt.semilogy( np.abs( x - hypergeom.ppf( prb, M, n, N ) ) + 1e-20 )

#Random number generation:

R = hypergeom.rvs( M, n, N, size=100 )

#Custom made discrete distribution:

vals = [np.arange( 7 ), ( 0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1 )]
custm = rv_discrete( name='custm', values=vals )
h = plt.plot( vals[0], custm.pmf( vals[0] ) )

Exemple #57
0
 def sample(self, x=None):
     """Draw ``x`` variates from ``hypergeom(self.M, self.X, self.m)``.

     ``x`` is forwarded as the size argument of ``rvs`` and ``self.random``
     as its random state.
     """
     frozen = hypergeom(self.M, self.X, self.m)
     return frozen.rvs(x, random_state=self.random)
def calculate_enrichment(N=100):
    """Per-experiment GO enrichment of the N lowest and N highest genes.

    For every experiment the genes are sorted by their second tuple element;
    the first ``N`` form the "top" list and the last ``N`` the "bottom" list.
    For each GO id, the hypergeometric probability of seeing at least the
    observed number of its genes in each list is stored.

    Returns ``(positive_enrichment_scores, negative_enrichment_scores)``:
    two dicts mapping GO id to an array with one probability per experiment.

    Relies on ``eas``, ``gene_to_go``, ``go_to_gene``, ``scistat`` and
    ``plt`` from the enclosing module.
    """
    experiment_dict = eas.experiment()
    # NOTE(review): the original allocated 32 slots but looped over 33
    # experiments; the count is now taken from the data. Assumes experiments
    # are indexed 0..len-1 - confirm against eas.experiment().
    n_experiments = len(experiment_dict)
    population_size = 4767

    counttop100 = {}
    countbot100 = {}
    goid_prob_top = {}
    goid_prob_bot = {}
    # initialize dictionary of goids
    with open("go_info.txt", 'r') as target:
        target.readline()  # skip the header line
        for line in target:
            fields = line.split()
            if not fields:
                continue
            goid = fields[0]
            counttop100[goid] = np.zeros(n_experiments)  # goid counts in top list
            countbot100[goid] = np.zeros(n_experiments)  # goid counts in bottom list
            goid_prob_top[goid] = np.zeros(n_experiments)  # goid prob
            goid_prob_bot[goid] = np.zeros(n_experiments)  # goid prob

    for i in range(n_experiments):
        # sorted() returns the ordered list; list.sort() returns None.
        sorted_genes = sorted(experiment_dict[i], key=lambda tup: tup[1])
        top = sorted_genes[:N]
        # [-100:0] was an empty slice; take the last N entries instead.
        bot = sorted_genes[-N:]
        for gene in top:
            counttop100[gene_to_go[gene[0]]][i] += 1
        # The bottom counts were previously incremented from the top list.
        for gene in bot:
            countbot100[gene_to_go[gene[0]]][i] += 1

        # hypergeom: probability of at least that many counts in N draws.
        # NOTE(review): the original stored an array of sf values in a scalar
        # slot; P(X >= count) is assumed to be the intended score.
        for goid in counttop100:
            rv = scistat.hypergeom(population_size, len(go_to_gene[goid]), N)
            goid_prob_top[goid][i] = rv.sf(counttop100[goid][i] - 1)

        for goid in countbot100:
            rv = scistat.hypergeom(population_size, len(go_to_gene[goid]), N)
            goid_prob_bot[goid][i] = rv.sf(countbot100[goid][i] - 1)

    # pcolor needs a 2-D array (GO ids x experiments), not a dict.
    heatmaptop = plt.pcolor(np.array(list(goid_prob_top.values())))
    heatmapbot = plt.pcolor(np.array(list(goid_prob_bot.values())))
    plt.show()

    positive_enrichment_scores = goid_prob_top
    negative_enrichment_scores = goid_prob_bot

    return positive_enrichment_scores,negative_enrichment_scores
    clu_index = 0
    for clu in clusters:
        clu_index += 1
        if clu_index > 100: break
        clu_set = clu

        clu_sig_score = 0
        clu_sig_smallest = -1;
        for bic in biclusters:
            bic_size = len(bic)
            clu_size = len(clu_set)
            overlap = len(bic & clu_set)


            hyper = hypergeom(OPR_COUNT, bic_size, clu_size)
            hypersf = hyper.sf(overlap)
            #print("%d\t%d\t%d\t%.3f" % (bic_size, clu_size, overlap, hypersf))
            if clu_sig_smallest < 0:
                clu_sig_smallest = hypersf
            elif clu_sig_smallest > hypersf:
                clu_sig_smallest = hypersf


            if hypersf < P_CUTOFF:
                if hypersf <= 0: hypersf = P_CUTOFF
                clu_sig_score += -math.log10(hypersf)

        clu_sig_score_p = 0
        clu_sig_smallest_p = -1;
        for bic in biclusters_p:
    print("cluster\tregulon\tclu_size\treg_size\toverlap\toverlap_coe(wiki)\tcoe2\tp-value(hypergeom)")
    for index,clu in enumerate(clusters):
        if len(clu) > 1:
            for reg_name in regulon.keys():
                reg = regulon[reg_name]
                clu_size = len(clu)
                reg_size = len(reg)
                overlap  = len(clu & reg)
                union    = len(clu | reg)

                coe1 = overlap_coe1(overlap, clu_size, reg_size)
                coe2 = overlap/float(union)

                if coe1 < 0.1 or coe2 < 0.1: continue

                rv = hypergeom(OPERON_COUNT, reg_size, clu_size)

                print("%s\t%s\t%i\t%i\t%i\t%.3f\t%.3f\t%g" %
                        (index + 1,
                            reg_name,
                            clu_size,
                            reg_size,
                            overlap,
                            coe1,
                            coe2,
                            rv.sf(overlap)))