Example #1
import numpy as np
from scipy.stats import hypergeom

def valid_covalue(data, parts):
    count = 0
    total = 0
    for i in range(0, len(parts) - 1):
        for j in range(i + 1, len(parts)):
            for partone in range(0, len(parts[i])):
                for parttwo in range(0, len(parts[j])):
                    left_i = parts[i][partone]
                    right_i = parts[j][parttwo]
                    left = data[left_i].astype(bool)
                    right = data[right_i].astype(bool)
                    k = np.count_nonzero(np.bitwise_and(left, right))
                    prb = hypergeom.cdf(k, len(left), np.count_nonzero(left),
                                        np.count_nonzero(right))
                    if 1 - prb < 0.05:
                        count += 1
                    total += 1
    print('false positive %.4f' % ((1.0 * count) / total))

    count = 0
    total = 0
    for i in range(0, len(parts)):
        for k in range(0, len(parts[i])):
            for j in range(k, len(parts[i])):
                left_i = parts[i][k]
                right_i = parts[i][j]
                left = data[left_i].astype(bool)
                right = data[right_i].astype(bool)
                # use a separate name for the overlap count so the loop
                # variable k is not clobbered
                n_overlap = np.count_nonzero(np.bitwise_and(left, right))
                prb = hypergeom.cdf(n_overlap, len(left), np.count_nonzero(left),
                                    np.count_nonzero(right))
                if 1 - prb > 0.05:
                    count += 1
                total += 1
    print('true positive %.4f' % ((1.0 * count) / total))
Example #2
from scipy.stats import hypergeom

def get_dic_pvalue(dic_fasta, dic_control, ctrl_all, fasta_all):
    """
    Calculate the p-value with a hypergeometric test of the count
    of the hexanucleotide, di-nucleotide, codon, codon position nucleotides,
    amino acids, or others in a control set of exons versus the exons in the fasta file

    :param dic_fasta:(dictionary of int) the count of the hexanucleotide, di-nucleotide,
    codon, codon position nucleotides, amino acids, or others in the fasta file given by the user
    :param dic_control:(dictionary of int) the count of the hexanucleotide, di-nucleotide,
    codon, codon position nucleotides, amino acids, or others in the control exons (exons CCE/ALL/ACE of fasterDB)
    :param ctrl_all: (int) the total number of the hexanucleotide, di-nucleotide,
    codon, codon position nucleotides, amino acids, or others in the control set of exons
    :param fasta_all:(int) the total number of the hexanucleotide, di-nucleotide,
    codon, codon position nucleotides, amino acids, or others in the fasta file
    :return: (dictionary of float) for each element, gives its calculated p-value
    """
    p_val = {}
    for key in dic_fasta.keys():
        p1 = hypergeom.cdf(dic_fasta[key], ctrl_all, dic_control[key], fasta_all)
        p2 = 1 - p1  # the two tails; only the smaller one is kept
        if p1 < p2:
            p_val[key] = p1
        else:
            p_val[key] = p2
    return p_val
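A minimal usage sketch for the function above (the dictionaries and totals are hypothetical illustrative values):

# hypothetical counts of one hexanucleotide in a user fasta file and a control set
dic_fasta = {'AAAAAA': 40}
dic_control = {'AAAAAA': 2000}
fasta_all = 10000     # total count of all elements in the fasta file
ctrl_all = 1000000    # total count of all elements in the control exons

p_vals = get_dic_pvalue(dic_fasta, dic_control, ctrl_all, fasta_all)
# each value is the smaller of the two tails, i.e. whichever of enrichment
# or depletion is more extreme for that element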
Example #3
def test_hypergeometric():
    assert_almost_equal(hypergeometric(4, 10, 5, 6, 'greater'),
                        1 - hypergeom.cdf(3, 10, 5, 6))
    assert_almost_equal(hypergeometric(4, 10, 5, 6, 'less'),
                        hypergeom.cdf(4, 10, 5, 6))
    assert_almost_equal(hypergeometric(4, 10, 5, 6, 'two-sided'),
                        2 * (1 - hypergeom.cdf(3, 10, 5, 6)))
Example #4
def test_hypergeometric():
    """Clopper-Pearson."""
    np.testing.assert_almost_equal(hypergeometric(4, 10, 5, 6, 'greater'),
                                   1 - hypergeom.cdf(3, 10, 5, 6))
    np.testing.assert_almost_equal(hypergeometric(4, 10, 5, 6, 'less'),
                                   hypergeom.cdf(4, 10, 5, 6))
    np.testing.assert_almost_equal(hypergeometric(4, 10, 5, 6, 'two-sided'),
                                   2 * (1 - hypergeom.cdf(3, 10, 5, 6)))
Example #5
def hypergeom_conf_interval(n, x, N, cl=0.975, alternative="two-sided", G=None,
                            **kwargs):
    """
    Confidence interval for a hypergeometric distribution parameter G, the number of good
    objects in a population of size N, based on the number x of good objects in a simple
    random sample of size n.

    Parameters
    ----------
    n : int
        The number of draws without replacement.
    x : int
        The number of "good" objects in the sample.
    N : int
        The number of objects in the population.
    cl : float in (0, 1)
        The desired confidence level.
    alternative : {"two-sided", "lower", "upper"}
        Indicates the alternative hypothesis.
    G : int in [0, N]
        Starting point in search for confidence bounds for the hypergeometric parameter G.
    kwargs : dict
        Keyword arguments passed to brentq

    Returns
    -------
    tuple
        lower and upper confidence bounds with coverage (at least)
        cl.

    Notes
    -----
    xtol : float
        Tolerance
    rtol : float
        Tolerance
    maxiter : int
        Maximum number of iterations.
    """
    assert alternative in ("two-sided", "lower", "upper")

    if G is None:
        G = (x / n)*N
    ci_low = 0
    ci_upp = N

    if alternative == 'two-sided':
        cl = 1 - (1-cl)/2

    if alternative != "upper" and x > 0:
        f = lambda q: cl - hypergeom.cdf(x-1, N, q, n)
        ci_low = math.ceil(brentq(f, 0.0, G, **kwargs))

    if alternative != "lower" and x < n:
        f = lambda q: hypergeom.cdf(x, N, q, n) - (1-cl)
        ci_upp = math.floor(brentq(f, G, N, **kwargs))

    return ci_low, ci_upp
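A quick usage sketch of the interval above (illustrative numbers; assumes math, scipy.stats.hypergeom and scipy.optimize.brentq are in scope, as the function body requires):

# two-sided 95% confidence interval for the number of good objects G
# in a population of N=100, given x=5 good objects in n=20 draws
lo, hi = hypergeom_conf_interval(20, 5, 100, cl=0.95, alternative="two-sided")
print(lo, hi)  # integer bounds on G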
Example #6
def hypergeom_conf_interval(n, x, N, cl=0.975, alternative="two-sided", G=None,
                            **kwargs):
    """
    Confidence interval for a hypergeometric distribution parameter G, the number of good
    objects in a population of size N, based on the number x of good objects in a simple
    random sample of size n.

    Parameters
    ----------
    n : int
        The number of draws without replacement.
    x : int
        The number of "good" objects in the sample.
    N : int
        The number of objects in the population.
    cl : float in (0, 1)
        The desired confidence level.
    alternative : {"two-sided", "lower", "upper"}
        Indicates the alternative hypothesis.
    G : int in [0, N]
        Starting point in search for confidence bounds for the hypergeometric parameter G.
    kwargs : dict
        Keyword arguments passed to brentq

    Returns
    -------
    tuple
        lower and upper confidence bounds with coverage (at least)
        cl.

    Notes
    -----
    xtol : float
        Tolerance
    rtol : float
        Tolerance
    maxiter : int
        Maximum number of iterations.
    """
    assert alternative in ("two-sided", "lower", "upper")

    if G is None:
        G = (x / n) * N
    ci_low = 0
    ci_upp = N

    if alternative == 'two-sided':
        cl = 1 - (1 - cl) / 2

    if alternative != "upper" and x > 0:
        f = lambda q: cl - hypergeom.cdf(x - 1, N, q, n)
        ci_low = math.ceil(brentq(f, 0.0, G, **kwargs))

    if alternative != "lower" and x < n:
        f = lambda q: hypergeom.cdf(x, N, q, n) - (1 - cl)
        ci_upp = math.floor(brentq(f, G, N, **kwargs))

    return ci_low, ci_upp
Example #7
    def score(self, x_t: Union[np.ndarray, Any]) -> np.ndarray:
        """
        Compute the test-statistic (FET) between the reference window(s) and test window.
        If a given test-window is not yet full then a test-statistic of np.nan is returned for that window.

        Parameters
        ----------
        x_t
            A single instance.

        Returns
        -------
        Estimated FET test statistics (1-p_val) between reference window and test windows.
        """
        values = set(np.unique(x_t))
        if not set(values).issubset(['0', '1', True, False]):
            raise ValueError(
                "The `x_t` data must consist of only (0,1)'s or (False,True)'s for the "
                "FETDriftOnline detector.")

        x_t = super()._preprocess_xt(x_t)
        self._update_state(x_t)

        stats = np.zeros((len(self.window_sizes), self.n_features),
                         dtype=np.float32)
        for k, ws in enumerate(self.window_sizes):
            if self.t >= ws:
                sum_last_ws = np.sum(self.xs[-ws:, :], axis=0)

                # Perform FET with hypergeom.cdf (this is vectorised over features)
                if self.alternative == 'greater':
                    p_vals = hypergeom.cdf(self.sum_ref, self.n + ws,
                                           self.sum_ref + sum_last_ws, self.n)
                else:
                    p_vals = hypergeom.cdf(sum_last_ws, self.n + ws,
                                           self.sum_ref + sum_last_ws, ws)

                # Compute test stat and apply smoothing
                stats_k = 1 - p_vals
                for f in range(self.n_features):
                    if len(self.test_stats) != 0 and not np.isnan(
                            self.test_stats[-1, k, f]):
                        stats_k[f] = (1 - self.lam) * self.test_stats[
                            -1, k, f] + self.lam * stats_k[f]
                stats[k, :] = stats_k
            else:
                stats[k, :] = np.nan
        return stats
Example #8
def enrichment(geneList1, geneList2, allGenes):
    tmp = set(geneList1).intersection(geneList2)
    x = len(tmp)
    M = allGenes
    n = len(geneList1)
    N = len(geneList2)
    # note: 1 - cdf(x, ...) is P(X > x); use hypergeom.sf(x - 1, ...) for P(X >= x)
    return [x, tmp, M, n, N, 1 - hypergeom.cdf(x, M, n, N)]
Example #9
def _TEST_(block, exptotal, bkgtotal, pseudo_count):
    sigPoi = 0
    sigHyp = 0
    ind = block.index
    for i in block.index:
        lamda = block.loc[i, 'BkgHop_withBonus']
        obs = block.loc[i, 'ExpHop_withBonus']
        BkgHop = block.loc[i, 'BkgHop']
        ExpHop = block.loc[i, 'ExpHop']
        P_poisson = 1 - poisson.cdf(int(obs) - 1, lamda + pseudo_count)
        if int(BkgHop) < 100000000 and int(ExpHop) < 100000000:
            P_hyper = 1 - hypergeom.cdf(ExpHop - 1,
                                        (bkgtotal + exptotal), exptotal,
                                        (ExpHop + BkgHop))
        else:
            P_hyper = '***'
        block.loc[i, 'BkgFraction'] = float(BkgHop / bkgtotal)
        block.loc[i, 'ExpFraction'] = float(ExpHop / exptotal)
        block.loc[i, 'P_Hyper'] = P_hyper
        block.loc[i, 'P_Poisson'] = P_poisson
        if P_poisson < alpha and lamda * obs != 0:
            sigPoi = sigPoi + 1
        if P_hyper != '***' and P_hyper < alpha:
            sigHyp = sigHyp + 1
    return block, sigPoi, sigHyp
Example #10
def hypergeom_test(data, sort_fdr=True):
    p_value_list = []
    ratio_in_study_list = []
    ratio_in_pop_list = []
    classes = []
    hit_genes = []
    hit_links = []
    path_names, typeIIs, typeIs = list(), list(), list()
    for study_hitnumber, pop_number, pop_hitnumber, study_number, each_class, associated_diff_info, link, path_name, typeII, typeI in data:
        p_value = 1 - hypergeom.cdf(study_hitnumber - 1, pop_number, pop_hitnumber, study_number)
        ratio_in_study = str(study_hitnumber) + '/' + str(study_number)
        ratio_in_pop = str(pop_hitnumber) + '/' + str(pop_number)
        p_value_list.append(p_value)
        ratio_in_study_list.append(ratio_in_study)
        ratio_in_pop_list.append(ratio_in_pop)
        classes.append(each_class)
        hit_genes.append(associated_diff_info)
        hit_links.append(link)
        path_names.append(path_name)
        typeIIs.append(typeII)
        typeIs.append(typeI)

    q_value_list = multipletests(p_value_list, method='fdr_bh')[1]
    number = len(q_value_list)
    # print(number)
    databases = ['KEGG PATHWAY'] * number
    result = zip(path_names, databases, classes, ratio_in_study_list, ratio_in_pop_list, p_value_list, q_value_list,
                 hit_genes, hit_links, typeIIs, typeIs)
    if sort_fdr:
        sorted_result = sorted(result, key=lambda x: (x[6], x[5]))
    else:
        sorted_result = sorted(result, key=lambda x: (x[5], x[6]))
    # num = sum([1 for x in p_value_list if x < 1])
    # return sorted_result[0:num]
    return sorted_result
Example #11
 def co_test_single(self, i, j):
     left = self.data[i].astype(bool)
     right = self.data[j].astype(bool)
     k = np.count_nonzero(np.bitwise_and(left, right))
     prb = hypergeom.cdf(k, len(left), np.count_nonzero(left),
                         np.count_nonzero(right))
     return 1 - prb
Example #12
 def test_hypergeom_cdf_lower(self):
     # check hypergeom cdf to return right value
     from scipy.stats import hypergeom
     h = HyperCI(n_pop=100, n_draw=20, k_s_obs=5)
     k_s = 30
     res = hypergeom.cdf(h.k_s_obs, h.n_pop, k_s, h.n_draw)
     self.assertEqual(res, 0.4009887932548518)
Example #13
    def run_analysis(self,
                     pvalue_cutoff: float = 0.05,
                     method: str = "hyperg",
                     limiter: int = 0) -> pd.DataFrame:

        results = []

        population = self.pathway_data["population"]

        num_cpds = len(self.compound_list)

        for pathway in self.pathway_data["pathways"]:

            pth_info = self.pathway_data["pathways"][pathway]
            pth_name = pth_info["name"]

            dbids = list(pth_info["compounds"].keys())
            pth_cpds = [pth_info["compounds"][x] for x in dbids]

            num_dbids = len(dbids)

            if num_dbids >= limiter:
                pathway_hits = self._check_if_in_pathway(pth_cpds)
                num_hits = len(pathway_hits)

                if method == "hyperg":
                    p_value = 1 - hypergeom.cdf(num_hits - 1, population -
                                                num_dbids, num_cpds, num_dbids)
                else:
                    _, p_value = fisher_exact(
                        [[num_hits, num_cpds - num_hits],
                         [
                             num_dbids - num_hits,
                             ((population - num_dbids) - num_cpds) + num_hits
                         ]])

                in_pathway_str = self._generate_string(pathway_hits, dbids)
                importance = self._calc_cov(num_hits, num_dbids)
                results.append([
                    pathway, pth_name, num_hits, num_dbids, p_value,
                    importance, in_pathway_str
                ])

        results = pd.DataFrame(results,
                               columns=[
                                   "Pathway ID", "Pathway Name", "Hits",
                                   "Pathway Compounds", "p", "Coverage",
                                   "Identifiers"
                               ])

        results.set_index("Pathway ID", inplace=True)

        _, holm_p, _, _ = multipletests(results["p"].values, method="holm")
        results.insert(4, "Holm p", holm_p)

        results = results[results["Holm p"] <= pvalue_cutoff]

        results.sort_values("p", inplace=True)

        self.results = results
Example #14
def Random_strongInteraction(part1, part2, cluster_pool1, cluster_pool2):
    global min_interaction, p_value
    ''' This is for computing FDR using random permutation '''
    c_interaction = {}
    for i in range(len(part1)):
        region1 = str(part1[i])
        region2 = str(part2[i])
        inter = "%d--%d" % (part1[i].cluster, part2[i].cluster)
        if inter in c_interaction:
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 0

    k = 0  # record for strong interactions
    n = 0
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction: continue
        i = int(interaction.split("--")[0])
        j = int(interaction.split("--")[1])
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except KeyError:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
    return [n, k]
Example #15
 def hyperConfidence(db, antc, consq):
     total = db.shape[0]
     cxy = raMetricas.abSupp(db, antc, consq)
     cx = raMetricas.abSupp(db, antc)
     cy = raMetricas.abSupp(db, consq)
     result = hypergeom.cdf(k=cxy - 1, M=total, n=cy, N=cx)
     return result
Example #16
def calc_hyp(node_list, cui_to_genes, N, Q):
    n = len(node_list)
    (assoc_count, assoc_genes) = get_assoc(node_list)

    assoc_analy = []
    for (a, k) in assoc_count.items():
        K = len(cui_to_genes[a])
        prb = 1 - hypergeom.cdf(k, N, K, n)
        assoc_analy.append([a, k, K, prb])
    # Q = 0.001
    sort_assoc = sorted(assoc_analy, key=lambda x: (x[3], x[0]))
    m = len(sort_assoc)
    mhc_assoc = []
    for (i, [a, k, K, prb]) in enumerate(sort_assoc):
        BH = (float(i + 1) /
              m) * Q  # calculate Benjamini-Hochberg based on ranked data
        mhc_assoc.append([i + 1, a, k, K, prb, BH])
    sig_assoc = []
    for [rank, phen, assnet, assint, prb, BH] in mhc_assoc:
        if prb < BH and assint > 24:
            genes = sorted(assoc_genes[phen])
            gene_str = ','.join(genes)
            phen_term = cui_to_phens[phen][0]  # use the first phenotype as the descriptor
            sig_assoc.append(
                [rank, phen_term, phen, assnet, assint, prb, BH, gene_str])
        elif prb > BH:
            break
    return sig_assoc
Example #17
def Random_strongInteraction(part1,part2,cluster_pool1,cluster_pool2):
    global min_interaction, p_value
    ''' This is for computing FDR using random permutation '''
    c_interaction={}
    for i in range(len(part1)):
        region1=str(part1[i])
        region2=str(part2[i])
        inter="%d--%d"%(part1[i].cluster,part2[i].cluster)
        if inter in c_interaction:
            c_interaction[inter]+=1
        else:
            c_interaction[inter]=0

    k=0 # record for strong interactions
    n=0
    for interaction in c_interaction:
        n=n+1
        count=c_interaction[interaction]
        if count<min_interaction: continue
        i=int(interaction.split("--")[0])
        j=int(interaction.split("--")[1])
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1=cluster_pool1[i].cluster
            count2=cluster_pool2[j].cluster
        except KeyError:
            continue
        real_p=1-hypergeom.cdf(count,len(part1),count1,count2)
        if real_p<=p_value:
            k=k+1
    return [n,k]
Example #18
def zeta(L, s, Nmutated, a):
    # zeta(L,s,Nmut,a)=P[H(L+Nmut,L-Nmut,s) >= a]
    # i.e. the tail of a hypergeometric distribution
    # in the manuscript, zeta(L,s,n,a) is F_n(a) for a given L and s
    #
    # see
    #   https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.hypergeom.html#scipy.stats.hypergeom
    #
    # hypergeom.cdf(k,M,n,N)
    #   k = number of red balls drawn = a-1 (not a)
    #   M = total number of balls     = L+Nmutated
    #   n = number of red balls       = L-Nmutated
    #   N = number of draws           = s

    #if (showZetaCalls):
    #	callStr = "zeta(%s,%s,%s,%s) = 1-hypergeom.cdf(%s,%s,%s,%s) = %.12f" \
    #	        % (L,s,Nmutated,a,
    #	           a-1,L+Nmutated,L-Nmutated,s,
    #               1 - hypergeom.cdf(a-1,L+Nmutated,L-Nmutated,s))
    #	cacheKey = (L,s,Nmutated,a)
    #	if (cacheKey in zeta_cache): callStr += " (from cache)"
    #	print(callStr,file=stderr)

    if (useCache):
        cacheKey = (L, s, Nmutated, a)
        if (cacheKey in zeta_cache):
            return zeta_cache[cacheKey]

    p = 1 - hypergeom.cdf(a - 1, L + Nmutated, L - Nmutated, s)

    if (useCache):
        zeta_cache[cacheKey] = p
    return p
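As a side note (not part of the original code), scipy's survival function computes the same upper tail directly and avoids cancellation in the 1 - cdf subtraction; a quick consistency check with illustrative values:

from scipy.stats import hypergeom

L, s, Nmutated, a = 100, 30, 10, 5
tail = 1 - hypergeom.cdf(a - 1, L + Nmutated, L - Nmutated, s)
sf_tail = hypergeom.sf(a - 1, L + Nmutated, L - Nmutated, s)  # P[X >= a]
assert abs(tail - sf_tail) < 1e-9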
Example #19
def pval(R, n_basepairs, q_size):
    t = R[:, 0]
    pvals = np.zeros(len(t))
    for i in range(0, len(t)):
        pvals[i] = hypergeom.cdf(t[i], n_basepairs, q_size, R[i, 2])

    return 1 - pvals
Example #20
def specificities(freqMotifParDep):
    """
        Calcule la spécificité du motif dans chaque département
            - entrée : dataframe contenant la fréquence du motif recherché par département
            - sortie : dictionnaire contenant pour chaque département la spécificité du motif
    """

    freqTot = 31868064
    freqTotParDep = pd.read_hdf('./static/freqByDep.hdf', 'freqTokensByDep')
    freqTotMotif = freqMotifParDep.sum().sum()
    df_freqTotMotif = pd.DataFrame(freqMotifParDep.sum(axis=1), columns=["0"])

    # Compute the expected frequency of the pattern in each department
    expectedCounts = df_freqTotMotif.dot(freqTotParDep) / freqTot
    specif = freqMotifParDep.copy()
    """
        Pour chaque département, la spécificité du motif est calculée à partir de :
            - la fréquence du motif dans le département en question (à partir de freqMotifParDep)
            - la fréquence totale de tous les tokens (freqTot)
            - la fréquence totale du motif (freqTotMotif)
            - la fréquence totale de tous les tokens dans le département (à partir de freqTotParDep)
    """
    for dep in freqMotifParDep.columns:
        if (freqMotifParDep.loc["freq", dep] < expectedCounts.loc["freq",
                                                                  dep]):
            specif.loc["freq",
                       dep] = hypergeom.cdf(freqMotifParDep.loc["freq", dep],
                                            freqTot, freqTotMotif,
                                            freqTotParDep.transpose().loc[dep])
        else:
            specif.loc["freq", dep] = 1 - hypergeom.cdf(
                freqMotifParDep.loc["freq", dep] - 1, freqTot, freqTotMotif,
                freqTotParDep.transpose().loc[dep])

    specif = np.log10(specif)
    specif[freqMotifParDep >=
           expectedCounts] = -specif[freqMotifParDep >= expectedCounts]

    # Values outside [-10, 10] are truncated
    for dep in specif:
        specif.loc[specif[dep] > 10, dep] = 10
        specif.loc[specif[dep] < -10, dep] = -10

    specif.rename(index={"freq": "specif"}, inplace=True)
    specif = pd.DataFrame.to_dict(specif)

    return specif
Example #21
 def cHgPvl(x,M,n,N):
     """
     x=randVar
     M=popSize
     n=totalSuccesses
     N=samplSize
     """
     return 1-hypergeom.cdf(x,M,n,N)+hypergeom.pmf(x,M,n,N)
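The expression above is the standard upper-tail p-value P(X >= x): 1 - cdf(x) gives P(X > x), and adding pmf(x) puts the observed value back in. A small check with illustrative numbers that it matches scipy's survival function:

from scipy.stats import hypergeom

x, M, n, N = 3, 50, 10, 12
p = 1 - hypergeom.cdf(x, M, n, N) + hypergeom.pmf(x, M, n, N)
assert abs(p - hypergeom.sf(x - 1, M, n, N)) < 1e-9  # both are P(X >= x)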
Example #22
    def _hypergeom_wrapper(self, x):

        from scipy.stats import hypergeom
        p = hypergeom.cdf(x['lonely triplets at pos'],
                          x['Num Triplets at Gene'],
                          x['lonely triplets at gen'],
                          x['Num Triplets at Pos'])
        return p
Example #23
def compHyperGemP(hits, bg_all, bg_hits, query_size):
    '''
    Returns P(X >= hits) and the fold-enrichment
    '''
    p = 1 - hypergeom.cdf(hits - 1, bg_all, bg_hits,
                          query_size)  # calculates P(X>=occ)
    fe = hits / ((bg_hits / bg_all) * query_size)
    return p, fe
Example #24
def GO_enrichment(geneList, ontology, expressedGenes = None, printIt=False, pCut = 1000000, xRef = {}):

    lenAllGenes, lenTheseGenes =  len(expressedGenes), len(geneList)
    pValues = defaultdict()
    nCmps = 0

    for GOTerm, GOGenes in ontology.items():
        inBoth = GOGenes['genes'].intersection(geneList)
        expressedGOGenes = GOGenes['genes'].intersection(expressedGenes)
        if len(inBoth) <= 3 or len(expressedGOGenes) < 5:
            pValues[GOTerm] = 'notest'
            continue
            
        pVal = (1. - hypergeom.cdf(len(inBoth), lenAllGenes, len(expressedGOGenes), lenTheseGenes))
        nCmps += 1
        if pVal < 0:
            pVal = 0
        symbols = []
        for ensg in inBoth:
            if ensg in xRef:
                symbols.append(xRef[ensg])
            else:
                symbols.append(ensg)
        pValues[GOTerm] = (pVal, len(inBoth), len(expressedGOGenes), len(GOGenes['genes']), inBoth, symbols)
        
    for k, v in pValues.items():
        if v == 'notest':
            continue
        # Bonferroni correction: scale the raw p-value by the number of comparisons
        pValues[k] = (v[0] * float(nCmps),) + v[1:]
    y = []

    # sort by p-value, pushing untested terms to the end
    sorted_x = sorted(pValues.items(),
                      key=lambda kv: kv[1][0] if kv[1] != 'notest' else float('inf'))

    for k, v in sorted_x:
        if v == "notest":
            continue
        if not type(k) == str:
            continue
        try:
            if v[0] > pCut:
                continue
            if printIt:
                print(k, "|".join(ontology[k]['name']), v[0], v[1], v[2], v[3],
                      ",".join(v[4]), ",".join(v[5]))
            y.append([k, "|".join(ontology[k]['name']), v[0], v[1], v[2], v[3], ",".join(v[4]), ",".join(v[5])])
            
        except:
            pass

    try:
        df = pd.DataFrame(y, columns=['GO Term ID', 'GO Term Description', 'Bonferroni-corrected Hypergeometric p-Value', 'N Genes in List and GO Category', 'N Expressed Genes in GO Category', 'N Genes in GO category', 'Ensembl Gene IDs in List', 'Gene symbols in List'])
        df.set_index('GO Term ID', inplace=True)
    except:
        df = pd.DataFrame(None, columns=['GO Term ID', 'GO Term Description', 'Bonferroni-corrected Hypergeometric p-Value', 'N Genes in List and GO Category', 'N Expressed Genes in GO Category', 'N Genes in GO category', 'Ensembl Gene IDs in List', 'Gene symbols in List'])

    return df
Example #25
def get_pa(N, n, c, p):
    """
    Get the probability of acceptance
    N = lot size
    n = sample size
    c = acceptance number (maximum defectives allowed in the sample)
    p = defective rate
    """
    return hypergeom.cdf(c, N, N * p, n)
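A usage sketch for an acceptance-sampling plan (illustrative numbers; note N * p should come out to a whole number of defectives for the hypergeometric parameters to be meaningful):

# probability of accepting a lot of 1000 items that is 2% defective,
# when inspecting a sample of 80 and allowing at most c=1 defective
pa = get_pa(N=1000, n=80, c=1, p=0.02)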
Example #26
def summary_to_hypergeometric_pvals(summary):
    #sys.stderr.write("C1\n")
    hypergeometric_pvals = {}
    for dbk in dbs.keys():
        #if dbk != 'jaspar':
        #    continue
        for p in paddings:
            # reduce padding-specific tuples in summary[k][p] to find TF indices where there are overlaps
            tf_overlaps = {}
            ordered_tfs = summary[dbk]['ordered_tfs']
            mapped_tfs = summary[dbk]['mapped_tfs']
            totals = summary[dbk]['totals']
            tups = summary[dbk][str(p)]
            for tup in tups:
                (probe, bitstring) = tup
                bits = list(bitstring)
                for idx, bit in enumerate(bits):
                    if bit == '1':
                        if ordered_tfs[idx] not in tf_overlaps:
                            tf_overlaps[ordered_tfs[idx]] = 0
                        tf_overlaps[ordered_tfs[idx]] += 1
            #print(tf_overlaps)

            # use these indices to count TF-overlap totals over set of probes, and then over all probes via summary[k]['totals']
            all_overlaps = {}
            for tfk in tf_overlaps.keys():
                idx = mapped_tfs[tfk]
                all_overlaps[tfk] = totals[idx]
            #print(all_overlaps)

            # for each TF, calculate a hypergeometric p-value
            for tfk in tf_overlaps.keys():
                # number of probes ("balls") in probe-subset ("sample") that overlap TF (sample of "red balls")
                x = int(tf_overlaps[tfk])
                # number of probes ("balls") in set ("urn") ("all balls, black and red")
                M = int(total_number_of_probes)
                # number of probes in set ("urn") that overlap TF (all "red balls")
                n = int(all_overlaps[tfk])
                # number of probes in probe-subset ("sample")
                N = int(len(probes))
                #print('\t'.join([tfk, str(x), str(M), str(n), str(N)]))
                pval = 1.0 - hypergeom.cdf(x, M, n, N)
                if pval == 0.0:
                    pval = 2.2250738585072014e-308
                ip = str(p)
                if dbk not in hypergeometric_pvals:
                    hypergeometric_pvals[dbk] = {}
                if ip not in hypergeometric_pvals[dbk]:
                    hypergeometric_pvals[dbk][ip] = []
                hypergeometric_pvals[dbk][ip].append({
                    'id': tfk,
                    'score': pval
                })

    #sys.stderr.write("C2\n")
    return hypergeometric_pvals
Example #27
 def diff_of_tail_area_and_cl(self, k_s_x, lf='left'):
     """
     Calculate left/right tail probability minus (confidence level)*0.5 for hypergeometric distribution.
     :param k_s_x: number of success in the population
     :param lf: left tail or right tail
     :return:
     when lf='left' it returns sum of density among [0:k_s_t+1] minus (confidence level) *0.5
     when lf='right' it returns sum of density [k_s_t:n_pop+1] minus (confidence level) *0.5
     """
     if lf == 'left':  # compute the left tail
         return abs(
             hypergeom.cdf(self.k_s_obs, self.n_pop, k_s_x, self.n_draw) -
             self.cl * 0.5)
     elif lf == 'right':  # compute the right tail
         return abs(1 - hypergeom.cdf(self.k_s_obs -
                                      1, self.n_pop, k_s_x, self.n_draw) -
                    self.cl * 0.5)
     else:
         raise ValueError('lf must be "left" or "right"')
Example #28
def hypergeom_function(white_balls_drawn, population, white_balls_in_population, total_balls_drawn):
	"""
	hypergeometric function for probability value 
	@param white_balls_drawn -- associated genes in input gene subset
	@param population -- population size (here, the official Homo sapiens genes in NCBI)
	@param white_balls_in_population -- associated genes in population
	@param total_balls_drawn -- input gene list size
	"""
	prob = 1-hypergeom.cdf(white_balls_drawn-1, population, white_balls_in_population, total_balls_drawn)
	return prob	
Example #29
    def _hypergeom_wrapper(self, x):

        from scipy.stats import hypergeom

        p = hypergeom.cdf(
            x["lonely triplets at pos"],
            x["Num Triplets at Gene"],
            x["lonely triplets at gen"],
            x["Num Triplets at Pos"],
        )
        return p
Example #30
def calculate_enrichment(genes_file,
                         peaks_file,
                         clusters,
                         n_genes,
                         working_dir,
                         distance=None,
                         report_entire_feature=False,
                         bedtools_exe="bedtools"):
    """
    Calculate enrichment for a single peak set and distance

    genes_file (str): path to BED file with all genes
    peaks_file (str): BED file containing the ChIP-seq peaks
    clusters (list): cluster files
    n_genes (int): total number of genes in the genes BED file
    distance (int): distance to calculate enrichments at
    report_entire_feature (bool): if True then run intersectBed
    with the -wa option (to report the entire feature, not just
    the overlap)
    bedtools_exe (str): 'bedtools' executable to use

    Returns tuple (pvalues,counts) i.e. col1 for p-values, col2 for
    number of genes.
    """
    # Initialise result arrays
    pvalues = np.zeros([len(clusters)])
    counts = np.zeros([len(clusters)])
    # Get set of genes overlapping this peak set for this distance
    overlap_genome = get_overlapping_genes(
        genes_file,
        peaks_file,
        distance,
        working_dir=working_dir,
        report_entire_feature=report_entire_feature)
    # Find subsets of overlapping genes in each RNA-seq cluster
    # and calculate enrichments
    for i, cluster_file in enumerate(clusters):
        # Read cluster file
        genes_cls = set(
            np.loadtxt(cluster_file,
                       delimiter='\t',
                       ndmin=1,
                       usecols=[0],
                       dtype=str))
        # No. of genes in current cluster (sample size)
        n = len(genes_cls)
        # Total number of overlapping genes (for set of all genes)
        K_i = len(overlap_genome)
        # Genes from the input regions based set, which are also in this cluster
        n_i = len(overlap_genome.intersection(genes_cls))
        # Calculate and store enrichment from hypergeometric function
        pvalues[i] = max(MIN_PVALUE, 1.0 - hg.cdf(n_i - 1, n_genes, n, K_i))
        counts[i] = n_i
    return (pvalues, counts)
Example #31
    def _one_fit(self):
        print("\nCreating downsampled doublets...")
        self._createDoublets()

        # Normalize combined augmented set
        print("Normalizing...")
        aug_counts = self.normalizer(
            np.append(self._raw_counts, self._raw_synthetics, axis=0))
        self._norm_counts = aug_counts[:self._num_cells]
        self._synthetics = aug_counts[self._num_cells:]

        print("Running PCA...")
        # Get phenograph results
        pca = PCA(n_components=self.n_components)
        print("Clustering augmented data set with Phenograph...\n")
        reduced_counts = pca.fit_transform(aug_counts)
        fullcommunities, _, _ = phenograph.cluster(
            reduced_counts, **self.phenograph_parameters)
        min_ID = min(fullcommunities)
        self.communities_ = fullcommunities[:self._num_cells]
        self.synth_communities_ = fullcommunities[self._num_cells:]
        community_sizes = [
            np.count_nonzero(fullcommunities == i)
            for i in np.unique(fullcommunities)
        ]
        print("Found communities [{0}, ... {2}], with sizes: {1}\n".format(
            min(fullcommunities), community_sizes, max(fullcommunities)))

        # Count number of fake doublets in each community and assign score
        # Number of synth/orig cells in each cluster.
        synth_cells_per_comm = collections.Counter(self.synth_communities_)
        orig_cells_per_comm = collections.Counter(self.communities_)
        community_IDs = orig_cells_per_comm.keys()
        community_scores = {
            i: float(synth_cells_per_comm[i]) /
            (synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        scores = np.array([community_scores[i] for i in self.communities_])

        community_p_values = {
            i: hypergeom.cdf(synth_cells_per_comm[i], aug_counts.shape[0],
                             self._synthetics.shape[0],
                             synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        p_values = np.array([community_p_values[i] for i in self.communities_])

        if min_ID < 0:
            scores[self.communities_ == -1] = np.nan
            p_values[self.communities_ == -1] = np.nan

        return scores, p_values
Example #32
    def hypergeom(self, white_balls_drawn, population,
                  white_balls_in_population, total_balls_drawn):
        """
		hypergeometric function for probability value 
    	@param white_balls_drawn -- associated gene in input genesubset
    	@param population -- population (here use the offical home sapiens genes in NCBI)
    	@param white_balls_in_population -- assoicated genes in population
   		@param total_balls_drawn -- input genelist size
		"""
        prob = 1 - hypergeom.cdf(white_balls_drawn - 1, population,
                                 white_balls_in_population, total_balls_drawn)
        return prob
Example #33
def specificities(lexicalTable,annotationType):
    from scipy.stats import hypergeom
    M=lexicalTable.sum().sum()
    lengths=pd.DataFrame(lexicalTable.sum())
    freq=pd.DataFrame(lexicalTable.sum(axis=1))
    expectedCounts=(freq.dot(lengths.transpose()))/M
    specif=lexicalTable.copy()
    for part in lexicalTable.columns:
        sys.stdout.write("\r5/6 - "+annotationType+" - Calcul des spécificités pour le département "+str(part))
        for word in lexicalTable.index:
            if lexicalTable.loc[word,part]<expectedCounts.loc[word,part] :
                specif.loc[word,part]=hypergeom.cdf(lexicalTable.loc[word,part],M, freq.loc[word], lengths.loc[part])
            else:
                specif.loc[word,part]=1-hypergeom.cdf(lexicalTable.loc[word,part]-1,M, freq.loc[word], lengths.loc[part])
    specif=np.log10(specif)
    specif[lexicalTable>=expectedCounts]=-specif[lexicalTable>=expectedCounts]
    sys.stdout.write("\n")
    # truncate values to [-10, 10] if desired
    for dep in specif :
        specif.loc[specif[dep] > 10,dep] = 10
        specif.loc[specif[dep] < -10,dep] = -10
    return specif
Example #34
def accumulative_hypergeometric(k, n, K, N):
    '''
    [k]: SUCCESS IN THE CLUSTER
    [n]: SIZE OF THE CLUSTER
    [K]: SUCCESS IN POPULATION
    [N]: SIZE OF THE POPULATION
    '''
    k, n, K, N = int(k), int(n), int(K), int(N)
    sf = hyp.sf(k, N, K, n)
    if sf < 1:
        return sf + hyp.pmf(k, N, K, n)
    else:
        return 1 - hyp.cdf(k, N, K, n) + hyp.pmf(k, N, K, n)
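Both branches above compute the same quantity, P(X >= k); the split only picks the more numerically stable form. A check with illustrative numbers, assuming hyp is scipy.stats.hypergeom as in the snippet:

k, n, K, N = 8, 20, 100, 1000
p = accumulative_hypergeometric(k, n, K, N)
assert abs(p - hyp.sf(k - 1, N, K, n)) < 1e-9  # both are P(X >= k)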
Example #35
def hypergeometric(objects_in_bin, total_size, objects_total, bin_size):
    p_over = np.log10(
        hypergeom.sf(objects_in_bin - 1, total_size, objects_total, bin_size))
    p_under = np.log10(
        hypergeom.cdf(objects_in_bin, total_size, objects_total, bin_size))
    if p_over < p_under:
        p = -p_over
    else:
        p = p_under
    if abs(p) > 3:
        return p / (abs(p)) * 3
    else:
        return p
Example #36
def plotheatmap_hypertest(ann_heatmap,
                          genes,
                          groupby,
                          filename,
                          vmin=0,
                          vmax=1.5,
                          figsize=(23, 2)):
    from scipy.stats import hypergeom
    ann_heatmap = ann_heatmap.copy()
    ann_heatmap.X = ann_heatmap.raw.X
    ann_heatmap = ann_heatmap[:, ann_heatmap.var_names.isin(genes)].copy()
    ann_heatmap.var['ncells'] = (ann_heatmap.X > 0).sum(axis=0).tolist()[0]
    records = {}
    htmap_df = pd.DataFrame(columns=ann_heatmap.var_names)
    #nn_heatmap.
    M = len(ann_heatmap.obs)
    for grp in ann_heatmap.obs[groupby].unique():
        test_group = ann_heatmap[ann_heatmap.obs[groupby] == grp].copy()
        test_group.var['ncells'] = (test_group.X > 0).sum(axis=0).tolist()[0]

        N = len(test_group.obs)
        records[grp] = []
        for g in genes:
            n = ann_heatmap.var.loc[g]['ncells']
            x = test_group.var.loc[g]['ncells']
            # add a small value to avoid 0 as a pvalue
            hyper_test_pval = 1 - hypergeom.cdf(x, M, n, N) + 10e-30
            records[grp].append(hyper_test_pval)

    htmap_df = pd.DataFrame.from_dict(records, orient='index', columns=genes)
    htmap_df = htmap_df.sort_index()
    f, ax = plt.subplots(1, 1, figsize=figsize)
    sns.heatmap(
        htmap_df.applymap(lambda x: -np.log10(x)),
        square=False,
        cmap=sns.light_palette('red', as_cmap=True),
        vmin=vmin,
        vmax=vmax,
        linewidths=.5,
        cbar_kws=dict(use_gridspec=False,
                      aspect=8,
                      label='$-log(p_{val})$',
                      anchor=(-0.3, 0.0)),
        ax=ax,
    )
    fn = filename
    f.savefig(fn + '.pdf', bbox_inches='tight', dpi=300)
    plt.close(f)
    return htmap_df
Example #37
def compare_cols(fg_col, fg_cons, fg_size, fg_weights,
                 bg_col, bg_cons, bg_size, bg_weights,
                 aa_freqs, pseudo_size):
    "Compare alignments using the hypergeometric model"
    # Number of consensus-type residues in the foreground column
    fg_cons_count = count_col(fg_col, fg_weights)[fg_cons]
    # Consensus residue frequency in the combined alignment column
    p_j = count_col(bg_col, bg_weights)[fg_cons] + fg_cons_count
    # Round fg counts & size to nearest integer for hypergeometric test
    fg_cons_count_i = max(1, int(ceil(fg_cons_count)))
    fg_size_i = int(ceil(fg_size))
    bg_size_i = int(ceil(bg_size))
    # Probability of fg col conservation vs. the combined/main set
    pvalue = 1.0 - hypergeom.cdf(fg_cons_count_i - 1, fg_size_i + bg_size_i,
                                 p_j, fg_size_i)
    return pvalue
Example #38
def doHyperG(genelist, allgenes, allterms, assocname):

    geneswithterms = allgenes.keys()
    termswithgenes = allterms.keys()

    M=len(geneswithterms)
    N=len(list(set(geneswithterms).intersection(set(genelist))))

    pvalues=[]
    termsingenelist=[]
    termsinbackground=[]
    termname=[]

    for t in termswithgenes:
        n = len(allterms[t])
        x = len(list(set(allterms[t]).intersection(set(genelist))))
        if x == 0:
            continue

        pvalue = 1.0 - hypergeom.cdf(x,M,n,N)
        pvalues.append(pvalue)

        termsingenelist.append(x)
        termsinbackground.append(n)
        termname.append(t)

    adjpvalue = list(fdrcorrection0(pvalues)[1])

    print("\t".join(["Term annotation", "pvalue", "fdr adj pvalue","Background","Expected","GeneList","Observed","Genes"]))
    for u in range(0,len(adjpvalue)):
        gotermname = termname[u]
        if termname[u] in assocname.keys():
            gotermname = assocname[termname[u]]
        print("\t".join([gotermname,
                         str(pvalues[u]),
                         str(adjpvalue[u]),
                         str(M),
                         str(termsinbackground[u]),
                         str(N),
                         str(termsingenelist[u]),
                         ",".join(list(set(allterms[termname[u]]).intersection(set(genelist))))]
                        )
              )
Example #39
def hypergeometric_test(x, M, n, N):
    """
    The hypergeometric distribution models drawing objects from a bin.
    - M is total number of objects
    - n is total number of Type I objects. 
    - x (random variate) represents the number of Type I objects in N drawn without replacement from the total population

    - http://en.wikipedia.org/wiki/Hypergeometric_distribution
    - https://www.biostars.org/p/66729/
    - http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.hypergeom.html
    - http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.hypergeometric.html
    - http://stackoverflow.com/questions/6594840/what-are-equivalents-to-rs-phyper-function-in-python
    """

    assert n <= M
    assert x <= n
    assert N <= M
    pv_le = hypergeom.cdf(x, M, n, N)  # P(X <= x)
    pv_gt = hypergeom.sf(x-1, M, n, N)  # P(X >= x); sf is more accurate than 1-cdf
    return pv_le, pv_gt
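A usage sketch with illustrative numbers: drawing N=500 genes from a population of M=5260, of which n=1998 belong to the category, and observing x=62 category members:

pv_le, pv_gt = hypergeometric_test(62, 5260, 1998, 500)
# pv_le: probability of 62 or fewer hits; pv_gt: probability of 62 or more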
Example #40
def _TEST_(block, exptotal, bkgtotal, pseudo_count):
    sigPoi = 0
    sigHyp = 0
    ind = block.index
    for i in block.index:
        lamda = block.loc[i, 'BkgHop_withBonus']
        obs = block.loc[i, 'ExpHop_withBonus']
        BkgHop = block.loc[i, 'BkgHop']
        ExpHop = block.loc[i, 'ExpHop']
        P_poisson = 1 - poisson.cdf(int(obs) - 1, lamda + pseudo_count)
        if int(BkgHop) < 100000000 and int(ExpHop) < 100000000:
            P_hyper = 1 - hypergeom.cdf(ExpHop - 1, (bkgtotal + exptotal), exptotal, (ExpHop + BkgHop))
        else:
            P_hyper = '***'
        block.loc[i, 'BkgFraction'] = float(BkgHop / bkgtotal)
        block.loc[i, 'ExpFraction'] = float(ExpHop / exptotal)
        block.loc[i, 'P_Hyper'] = P_hyper
        block.loc[i, 'P_Poisson'] = P_poisson
        if P_poisson < alpha and lamda * obs != 0:
            sigPoi = sigPoi + 1
        if P_hyper != '***' and P_hyper < alpha:
            sigHyp = sigHyp + 1
    return block, sigPoi, sigHyp
Example #41
def mutual_exclusivity(array_a, array_b, verbose=1):
    """ Performs a hypergeometric test assessing the probability of array_a and array_b are mutual exclusive.

    Arguments:
        array_a, array_b: boolean array - discretised values

    Returns:
        p-value: float
    """

    a, b = np.array(array_a), np.array(array_b)

    x = np.sum(np.bitwise_and(a == False, b == True)) - 1

    M = len(a)

    n = np.sum(a == False)

    N = np.sum(b == True)

    if verbose > 0:
        print('[INFO] x, M, n, N: ', x, M, n, N)

    return 1.0 - hypergeom.cdf(x, M, n, N)
Example #42
	control.append(get_control(path))
	sys.stderr.write("%d blocks were read from\t%s\n" % (len(control[-1]), path))

counts, total = count_conservation(signal, iid2seed, matches)

r_control = []
for maf_control in control:
	cc,tc = count_control(maf_control, seeds, matches)
	r_control.append((cc,tc))
	

for k,v in Counter(total).most_common(10):
	name = mir_lib.shortname(seed2mirs[k])
	a = [name, counts[k], v, counts[k]/float(v)]
	for cc,tc in r_control:
		rawp = (1 - hypergeom.cdf(counts[k], tc[k], cc[k], v))
		p_value = "%.2e" % rawp
		a += [cc[k], tc[k], cc[k]/(tc[k] + 0.1), p_value]
	print("\t".join([str(x) for x in a]))

#micros = parsing.getDictFromFa(args.micro);
#micros = parsing.to_upper_case(micros);

#for key, value in micros.iteritems():
  #seed_dict[reverse_complement(value[1:7])].append(key);


#counts_true, total_true = count_conservation(maf_dict, micros)
#seeds = set([reverse_complement(x[1:7]) for x in micros.values()])
#counts_control, total_control  = count_control(maf_control, seeds)
Example #43
def enrichment_test(the_present_set, the_absent_set, the_grouping_dict, **kwargs):
    """ Hypergeometric testing for enrichment.

    Arguments:
     the_present_set: a list or set of items that were tested and were positive
     the_absent_set: a list or set of items that were tested for but
      were not present / significant
     the_grouping_dict: a dict of items to check for enrichment.
      key: grouping name, e.g. GO enrichment category, complex name, etc...
      value: a list or set of items in the list to test, e.g. gene or protein id's

    kwargs:
     filter_items: [True (default) / False]
      Whether to filter the items in the_grouping_dict and 
      the_present_set / the_absent_set so only items
      in common are tested.
     verbose: [False(default) / True]
     method: type of mt testing to apply, default is "none", see
      mtcorrect() for more information
     direction: ["enrichment" (default), "depletion"]
      essentially, whether to look at the right or left tail

    returns:
     test_result_dict: a dict with:
      {the_grouping: {'p': the corrected p-value,
       'n_present': number in the grouping that were present,
       'n_group': the total size of the group}}

    """
    from copy import deepcopy
    from scipy.stats import hypergeom

    filter_items = test_kwarg('filter_items', kwargs, [True, False])
    verbose = test_kwarg('verbose', kwargs, [False, True])
    method = test_kwarg('method', kwargs, mtcorrect_methods)
    direction = test_kwarg('direction', kwargs, ["enrichment", "depletion"])

    the_present_set = set(the_present_set)
    the_absent_set = set(the_absent_set)
    all_test_items_set = the_present_set |  the_absent_set
    
    the_grouping_dict = deepcopy(the_grouping_dict)
    all_grouping_items_set = set([])
    the_groupings_to_test = list(the_grouping_dict.keys())
    for the_grouping in the_groupings_to_test:
        the_grouping_dict[the_grouping] = set(the_grouping_dict[the_grouping])
        if filter_items:
            the_grouping_dict[the_grouping] = the_grouping_dict[the_grouping] & all_test_items_set
        all_grouping_items_set.update(the_grouping_dict[the_grouping])
        if len(the_grouping_dict[the_grouping]) == 0:
            the_grouping_dict.pop(the_grouping)

    if filter_items:
        the_present_set = the_present_set & all_grouping_items_set
        the_absent_set = the_absent_set & all_grouping_items_set
        all_test_items_set = all_test_items_set & all_grouping_items_set

    test_result_dict = {}
    n_present_dict = {}
    group_size_dict = {}

    for the_grouping in the_grouping_dict.keys():
        # x: number of present items in the grouping
        # M: total number of items
        # n: total number of present items
        # N: total number of items in the grouping
        the_grouping_set = the_grouping_dict[the_grouping]
        x = len(the_present_set & the_grouping_set)
        M = len(all_test_items_set)
        n = len(the_present_set)
        N = len(the_grouping_set)
        if direction == "enrichment":
            # We want the probability that x or more than x can be chosen randomly,
            # so we must subtract 1
            if x >= 1:
                the_p = hypergeom.sf(x-1,M,n,N,loc=0)
            else:
                the_p = 1.
        else:
            the_p = hypergeom.cdf(x,M,n,N,loc=0)
        
        test_result_dict[the_grouping] = the_p
        n_present_dict[the_grouping] = x
        group_size_dict[the_grouping] = N

    corrected_p_dict = mtcorrect(test_result_dict, method = method) 
        
    final_result_dict = {}
    for the_grouping in the_grouping_dict.keys():
        final_result_dict[the_grouping] = {}
        final_result_dict[the_grouping]['n_present'] = n_present_dict[the_grouping]
        final_result_dict[the_grouping]['n_group'] = group_size_dict[the_grouping]
        final_result_dict[the_grouping]['p'] = corrected_p_dict[the_grouping]


    return final_result_dict
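A hypothetical usage sketch (assumes the module's own helpers test_kwarg and mtcorrect are in scope alongside enrichment_test, and that 'none' is a valid correction method as the docstring states):

present = {'g1', 'g2', 'g3'}         # tested and positive
absent = {'g4', 'g5', 'g6', 'g7'}    # tested, not significant
groups = {'complex A': ['g1', 'g2', 'g4'],
          'complex B': ['g5', 'g6']}

result = enrichment_test(present, absent, groups, method='none')
# result['complex A'] -> {'n_present': 2, 'n_group': 3, 'p': ...}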
Example #44
def Main():
    t1=time()
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')
    outputIntra = open(args.output_intra, 'w')

    hasAnnotation = False
    if args.annotation:
        dbi = DBI.init(args.annotation, "bed")
        hasAnnotation = True
    else:
        dbi = False

    if args.annotation_repeat:
        dbirepeat = DBI.init(args.annotation_repeat, "bed")
        hasAnnotationRepeat = True
    else:
        hasAnnotationRepeat = False
        dbirepeat = False

    #store count of RNA for part1 and part2
    part={}


    k=0
    sgcount = 0 #single fragment count
    
    print >> sys.stderr,"# Inputing data..."
    interaction = {}  # store number of interactions for different RNA
    selfinteraction = {}



    #Types = ["snoRNA","protein_coding","snRNA","lincRNA","tRNA","misc_RNA","pseudogene","miRNA","antisense","sense_intronic","non_coding","processed_transcript","sense_overlapping","rRNA_repeat","rRNA"]
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        p1=annotated_bed_proper(line[0:10],id=k,cluster=1)
        p2=annotated_bed_proper(line[11:],id=k,cluster=1)
        if isinstance(p1.start, list):
            p1.start=int(p1.start[0])
            p1.end=int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start=int(p2.start[0])
            p2.end=int(p2.end[-1])
                
        if SingleFragment(p1,p2):
            sgcount += 1
            continue
        k+=1
        #if p1.subtype=="intron" or p2.subtype=="intron": continue
        #if p1.type in Types:
        try:
            p1_name = GetAnnotationName(p1, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat) 
            if p1_name not in part:
                part[p1_name]=1
            else:
                part[p1_name]+=1  
            #if p2.type in Types:
            p2_name = GetAnnotationName(p2, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat) 
            if not p1_name == p2_name: # count once for self-interaction
                if p2_name not in part:
                    part[p2_name]=1
                else:
                    part[p2_name]+=1
            #if p1.type in Types and p2.type in Types:
            if p1_name == p2_name:
                if p1_name not in selfinteraction:
                    selfinteraction[p1_name]=copy.deepcopy(p1)
                else:
                    selfinteraction[p1_name].Update(p1.start, p1.end)
                    selfinteraction[p1_name].Update(p2.start, p2.end)
                    selfinteraction[p1_name].cluster += 1
            else:
                if p1_name > p2_name:
                    p1, p2 = p2, p1
                    p1_name, p2_name = p2_name, p1_name
                inter_name = p1_name + "--" + p2_name
                if inter_name not in interaction:
                    interaction[inter_name]=[copy.deepcopy(p1),copy.deepcopy(p2)]
                else:
                    interaction[inter_name][0].Update(p1.start,p1.end)
                    interaction[inter_name][1].Update(p2.start,p2.end)
                    interaction[inter_name][0].cluster+=1
        except Exception as e:
            print(e, file=sys.stderr)
        if k % 20000 == 0:
            print("  Reading %d pairs of segments\r" % (k), end='', file=sys.stderr)
    print("Get total %d pairs." % (k))
    print("Single fragment count: %d." % (sgcount))

    print("   number of different RNAs is %d          " % (len(part)))
    
    total = k # total pairs used
    n=0
    k=0  # record number of strong interactions
    for i in interaction:
        n+=1
        count = interaction[i][0].cluster
        if count < min_interaction: continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part[p1_name]
        P2.cluster = part[p2_name]
        if part[p1_name]<min_clusterS or part[p2_name]<min_clusterS: continue
        real_p=1-hypergeom.cdf(count,total,part[p1_name],part[p2_name])
        if real_p<=p_value:
            k=k+1
            try:
                log_p = math.log(real_p)
            except ValueError:
                log_p = -float("Inf")
            print(str(P1)+'\t'+str(P2)+'\t%d\t%.4f' % (count, log_p), file=output)
        if n % 500 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(interaction)), end='', file=sys.stderr)
    k1=0
    for i in selfinteraction:
        n+=1
        count = selfinteraction[i].cluster
        if count < min_interaction: continue
        p1_name = i
        P1 = selfinteraction[i]
        P1.cluster = part[p1_name]
        if part[p1_name]<min_clusterS: continue
        k1=k1+1
        print(str(P1)+'\t%d' % (count), file=outputIntra)
        if n % 500 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(interaction)), end='', file=sys.stderr)
    print("# Find %d strong and %d self interactions. Cost time: %.2f s" % (k, k1, time()-t1))
Example #45
def Main():
    t1=time()
    
    global min_interaction, p_value
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')
    ncpus=args.parallel


    #store genomic location of part1 and part2
    part1=[]
    part2=[]


    k=0
    
    print >> sys.stderr,"# Inputing data..."

    chr_list=[]
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        p1=annotated_bed(line[0:10],id=k)
        p2=annotated_bed(line[11:],id=k)
        if isinstance(p1.start, list):
            p1.start=int(p1.start[0])
            p1.end=int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start=int(p2.start[0])
            p2.end=int(p2.end[-1])
        if SingleFragment(p1,p2): continue
        k+=1
        part1.append(p1)
        part2.append(p2)
        if p1.chr not in chr_list: chr_list.append(p1.chr)
        if p2.chr not in chr_list: chr_list.append(p2.chr)
        if k % 20000 == 0:
            print("  Reading %d pairs of segments\r" % (k), end='', file=sys.stderr)
    print("Get total %d pairs." % (k), file=sys.stderr)
    
    if len(part1)!=len(part2):
        print >> sys.stderr, "## ERROR: number of regions in two part not match!!"
        sys.exit(0)

    # sort in genomic order, easy for clustering
    part1=sorted(part1, key=attrgetter('start'))
    part1=sorted(part1, key=attrgetter('chr'))
    part2=sorted(part2, key=attrgetter('start'))
    part2=sorted(part2, key=attrgetter('chr'))

    # for parallel computing 
    print >>sys.stderr,"# Generating clusters for two parts..."
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs1=[]
    jobs2=[]
    for chro in chr_list:
        part1_temp = list(filter(lambda p: p.chr == chro, part1))
        if len(part1_temp) > 0:
            jobs1.append(job_server.submit(cluster_regions,(part1_temp,min_clusterS),(annotated_bed,),("UnionFind","copy",)))
        part2_temp = list(filter(lambda p: p.chr == chro, part2))
        if len(part2_temp) > 0:
            jobs2.append(job_server.submit(cluster_regions,(part2_temp,min_clusterS),(annotated_bed,),("UnionFind","copy",)))
        

    cluster_pool1={}
    part1=[]
    for job in jobs1: 
        try:
            part1=part1+job()[1]
            cluster_pool1.update(job()[0])
        except:
            print >> sys.stderr, "Wrong in %s, part1"%(job()[2])
            continue
    cluster_pool2={}
    part2=[]
    for job in jobs2:
        try:
            part2=part2+job()[1]
            cluster_pool2.update(job()[0])
        except:
            continue


    print >>sys.stderr,"   cluster number for part1 is %d          "%(len(cluster_pool1))
    print >>sys.stderr,"   cluster number for part2 is %d          "%(len(cluster_pool2))

    # sort back to pair two parts
    part1=sorted(part1, key=attrgetter('id'))
    part2=sorted(part2, key=attrgetter('id'))

    print >> sys.stderr,"size of part1&2:",len(part1),len(part2)

    c_interaction={}
    for i in range(len(part1)):
        region1=str(part1[i])
        region2=str(part2[i])
        try:
            inter=part1[i].cluster+"--"+part2[i].cluster
        except:
            print >> sys.stderr,i,part1[i].cluster,part2[i].cluster
            sys.exit()
        if c_interaction.has_key(inter):
            c_interaction[inter]+=1
        else:
            c_interaction[inter]=1

    # annotation file
    print >> sys.stderr,"# Indexing annotation files"
    dbi_all=DBI.init(args.annotation,"bed")
    dbi_detail=DBI.init(args.db_detail,"bed")
    dbi_repeat=DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt","bed")


    print >> sys.stderr,"# finding strong interactions from clusters..."
    k=0 # record for strong interactions
    n=0

    # annotation file

    for interaction in c_interaction:
        n=n+1
        count=c_interaction[interaction]
        if count<min_interaction: continue
        i=interaction.split("--")[0]
        j=interaction.split("--")[1]
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1=cluster_pool1[i].cluster
            count2=cluster_pool2[j].cluster
        except:
            continue
        real_p=1-hypergeom.cdf(count,len(part1),count1,count2)
        if real_p<=p_value:
            k=k+1
            cluster_pool1[i].Annotate(dbi_all,dbi_detail,dbi_repeat)
            cluster_pool2[j].Annotate(dbi_all,dbi_detail,dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >> output,str(cluster_pool1[i])+'\t'+str(cluster_pool2[j])+'\t%d\t%.4f'%(count,log_p)
        if n%1000==0: print >> sys.stderr, "  Progress ( %d / %d )\r"%(n,len(c_interaction)),

    print >> sys.stderr,"# Find %d strong interactions. Cost time: %.2f s"%(k,time()-t1)

    if args.FDR:
        print >> sys.stderr, "# Permutated results:"
        for i in range(10):
            shuffle(part2)
            [n_r_I,n_r_SI]=Random_strongInteraction(part1,part2,cluster_pool1,cluster_pool2)
            print >> sys.stderr, "  ",i, n_r_I, n_r_SI, n_r_SI*1.0/n_r_I
Example No. 46
def enrichment_hypergeo(
    termList, entityList, species, useIea=True, asGenes=True, aspect="biological_process", verbose=True
):

    """
    termList   -- the terms to be tested
    species    -- an NCBI taxa id
    entityList -- gene or uniprot ids

    What is the probability of finding a given number of terms if we randomly select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws, i.e. the size of the gene list
    k -- number of genes annotated with a given term (total type I objects)
    x -- number of times we observe the term in the gene list

    In R the cdf can be obtained with phyper(x, k, M-k, N);
    the scipy equivalent is hypergeom.cdf(x, M, k, N)

    Returns a dict where term id is the key and the hypergeometric p-value is the value
    """

    ## connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine, useIea=useIea, verbose=verbose, aspect=aspect)

    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    go2entity = {}
    for entity, go in entity2go.iteritems():
        for term in go:
            if not go2entity.has_key(term):
                go2entity[term] = set([])
            go2entity[term].update([entity])
    for go, entity in go2entity.iteritems():
        go2entity[go] = list(entity)

    print ("total go terms - %s" % (len(go2entity.keys())))
    print ("total entities - %s" % (len(entity2go.keys())))

    ## set variables
    M = len(entity2go.keys())
    N = len(entityList)
    results = {}

    for testTerm in termList:
        ## find
        k = len(go2entity[testTerm])
        x = 0
        for entity in entityList:
            if entity in entity2go and testTerm in entity2go[entity]:
                x += 1

        ## get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            # two-sided p-value: double the smaller tail of the distribution
            cdf = hypergeom.cdf(x, M, k, N, loc=0)
            if cdf > 0.5:
                pvalue = 2 * (1 - cdf)
            else:
                pvalue = 2 * cdf
        results[testTerm] = pvalue

    return results
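
The R/scipy correspondence noted in the docstring can be checked directly; a small sketch with illustrative numbers (not taken from the snippet):

from scipy.stats import hypergeom

M, k, N, x = 500, 40, 60, 9        # population, annotated genes, list size, hits
cdf = hypergeom.cdf(x, M, k, N)    # same value as R's phyper(x, k, M - k, N)
two_sided = 2 * min(cdf, 1 - cdf)  # the doubled smaller tail computed above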
Example No. 47
def Main():
    t1=time()
    
    global min_interaction, p_value
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')


    #store genomic location of part1 and part2
    part1=[]
    part2=[]


    k=0
    
    print >> sys.stderr,"# Inputing data..."
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        k=k+1
        part1.append(annotated_bed(line[0:7],id=k))
        part2.append(annotated_bed(line[8:],id=k))
        if k%20000==0: 
            print >> sys.stderr,"  Reading %d pairs of segments\r"%(k),
    print >> sys.stderr,"Get total %d pairs."%(k)
    
    if len(part1)!=len(part2):
        print >> sys.stderr, "## ERROR: number of regions in two part not match!!"
        sys.exit(0)

    # sort in genomic order, easy for clustering
    part1=sorted(part1, key=attrgetter('start'))
    part1=sorted(part1, key=attrgetter('chr'))
    part2=sorted(part2, key=attrgetter('start'))
    part2=sorted(part2, key=attrgetter('chr'))


    print >>sys.stderr,"# Generating clusters for two parts..."
    print >>sys.stderr,"  Part1:"
    cluster_pool1=cluster_regions(part1,min_clusterS)
    print >>sys.stderr,"   cluster number for part1 is %d          "%(len(cluster_pool1))
    print >>sys.stderr,"  Part2:"
    cluster_pool2=cluster_regions(part2,min_clusterS)
    print >>sys.stderr,"   cluster number for part2 is %d          "%(len(cluster_pool2))

    # sort back to pair two parts
    part1=sorted(part1, key=attrgetter('id'))
    part2=sorted(part2, key=attrgetter('id'))

    c_interaction={}
    for i in range(len(part1)):
        region1=str(part1[i])
        region2=str(part2[i])
        inter="%d--%d"%(part1[i].cluster,part2[i].cluster)
        if c_interaction.has_key(inter):
            c_interaction[inter]+=1
        else:
            c_interaction[inter]=1  # first observed pair counts as one

    print >> sys.stderr,"# finding strong interactions from clusters..."
    k=0 # record for strong interactions
    n=0
    for interaction in c_interaction:
        n=n+1
        count=c_interaction[interaction]
        if count<min_interaction: continue
        i=int(interaction.split("--")[0])
        j=int(interaction.split("--")[1])
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1=cluster_pool1[i].cluster
            count2=cluster_pool2[j].cluster
        except:
            continue
        real_p=1-hypergeom.cdf(count,len(part1),count1,count2)
        if real_p<=p_value:
            k=k+1
            print >> output,str(cluster_pool1[i])+'\t'+str(cluster_pool2[j])+'\t%d\t%.5f'%(count,real_p)
        if n%1000==0: print >> sys.stderr, "  Progress ( %d / %d )\r"%(n,len(c_interaction)),

    print >> sys.stderr,"# Find %d strong interactions. Cost time: %.2f s"%(k,time()-t1)

    if args.FDR:
        print >> sys.stderr, "# Permutated results:"
        for i in range(10):
            shuffle(part2)
            [n_r_I,n_r_SI]=Random_strongInteraction(part1,part2,cluster_pool1,cluster_pool2)
            print >> sys.stderr, "  ",i, n_r_I, n_r_SI, n_r_SI*1.0/n_r_I
Example No. 48
    rxnpvals = np.array(rxnpvals)
    recon2subs = np.array(recon2subs)
    rxnrangediffs = np.array(rxnrangediffs)
    uniqSubs = np.unique(recon2subs)
    numrxnspos = []
    numsigrxnspos = []
    hypergeomparrpos = []
    for sub in uniqSubs:
        N = len(recon2rxns)
        M = sum(recon2subs==sub)
        K = sum(np.logical_and(rxnpvals<.05,rxnrangediffs>0))
        x = sum(np.logical_and(np.logical_and(rxnpvals<.05,rxnrangediffs>0),recon2subs==sub))
        numrxnspos.append(M)
        numsigrxnspos.append(x)
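        # P(X >= x): upper-tail enrichment of significant positive reactions in this subsystem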
        hypergeomparrpos.append(1-hypergeom.cdf(x-1,N,M,K))
    numrxnsneg = []
    numsigrxnsneg = []
    hypergeomparrneg = []
    for sub in uniqSubs:
        N = len(recon2rxns)
        M = sum(recon2subs==sub)
        K = sum(np.logical_and(rxnpvals<.05,rxnrangediffs<0))
        x = sum(np.logical_and(np.logical_and(rxnpvals<.05,rxnrangediffs<0),recon2subs==sub))
        numrxnsneg.append(M)
        numsigrxnsneg.append(x)
        hypergeomparrneg.append(1-hypergeom.cdf(x-1,N,M,K))
    labelarr = []
    valuesarr = []
    youngvals = totalvar['young'].values()
    for i in range(len(youngvals)):
Example No. 49
def _motif_sig(fore_hits, fore_size, back_hits, back_size):
    return (
        1 -
        hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) +
        hypergeom.pmf(fore_hits, back_size, back_hits, fore_size)
    )
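
The expression above is the upper-tail probability P(X >= fore_hits), which scipy also exposes directly as the survival function; a quick equivalence check with illustrative counts:

from scipy.stats import hypergeom

fore_hits, fore_size, back_hits, back_size = 12, 80, 400, 5000
p_manual = (1 - hypergeom.cdf(fore_hits, back_size, back_hits, fore_size)
            + hypergeom.pmf(fore_hits, back_size, back_hits, fore_size))
p_sf = hypergeom.sf(fore_hits - 1, back_size, back_hits, fore_size)
assert abs(p_manual - p_sf) < 1e-9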
Example No. 50
def cphyper(k, K, n, N):
    return 1.0 - hypergeom.cdf(k - 1, N, n, K)
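
This is the same upper tail written via the cdf; hypergeom.sf returns it in one call. A quick sanity check against the function above (illustrative numbers):

from scipy.stats import hypergeom
assert abs(cphyper(3, 10, 7, 50) - hypergeom.sf(2, 50, 7, 10)) < 1e-12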
Example No. 51
import matplotlib.pyplot as plt
from scipy.stats import hypergeom, rv_discrete
import numpy as np
numargs = hypergeom.numargs
# shape parameters: population size M, type-I objects n, draws N (n <= M)
[ M, n, N ] = [20, 10, 3]

#Display frozen pmf:

rv = hypergeom( M, n, N )
print rv.dist.b
x = np.arange( 0, min( rv.dist.b, 3 ) + 1 )  # built-in min: rv.dist.b is a scalar
h = plt.plot( x, rv.pmf( x ) )

#Check accuracy of cdf and ppf:

prb = hypergeom.cdf( x, M, n, N )
h = plt.semilogy( np.abs( x - hypergeom.ppf( prb, M, n, N ) ) + 1e-20 )

#Random number generation:

R = hypergeom.rvs( M, n, N, size=100 )

#Custom made discrete distribution:

vals = [np.arange( 7 ), ( 0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1 )]
custm = rv_discrete( name='custm', values=vals )
h = plt.plot( vals[0], custm.pmf( vals[0] ) )

Example No. 52
def fisher_exact(c):
    """Performs a Fisher exact test on a 2x2 contingency table.

    Parameters
    ----------
    c : array_like of ints
        A 2x2 contingency table.

    Returns
    -------
    oddsratio : float
        This is prior odds ratio and not a posterior estimate.
    p-value : float
        P-value for 2-sided hypothesis of independence.


    Examples
    --------
    >>> fisher_exact([[100, 2], [1000, 5]])
    (0.25, 0.13007593634330314)
    """

    c = np.asarray(c, dtype=np.int64)  # int32 is not enough for the algorithm
    oddsratio = c[0,0] * c[1,1] / float(c[1,0] * c[0,1]) \
                            if (c[1,0] > 0 and c[0,1] > 0) else np.inf
    n1 = c[0,0] + c[0,1]
    n2 = c[1,0] + c[1,1]
    n  = c[0,0] + c[1,0]

    mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
    pexact = hypergeom.pmf(c[0,0], n1 + n2, n1, n)
    pmode = hypergeom.pmf(mode, n1 + n2, n1, n)

    epsilon = 1 - 1e-4
    if float(np.abs(pexact - pmode)) / np.abs(max(pexact, pmode)) <= 1 - epsilon:
        return oddsratio, 1

    elif c[0,0] < mode:
        plower = hypergeom.cdf(c[0,0], n1 + n2, n1, n)

        if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
            return oddsratio, plower

        # Binary search for where to begin upper half.
        min = mode
        max = n
        guess = -1
        while max - min > 1:
            guess = max if max == min + 1 and guess == min else (max + min) / 2

            pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
            if pguess <= pexact and hypergeom.pmf(guess - 1, n1 + n2, n1, n) > pexact:
                break
            elif pguess < pexact:
                max = guess
            else:
                min = guess

        if guess == -1:
            guess = min

        while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
            guess -= 1

        while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
            guess += 1

        p = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
        if p > 1.0:
            p = 1.0
        return oddsratio, p
    else:
        pupper = hypergeom.sf(c[0,0] - 1, n1 + n2, n1, n)
        if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
            return oddsratio, pupper

        # Binary search for where to begin lower half.
        min = 0
        max = mode
        guess = -1
        while max - min > 1:
            guess = max if max == min + 1 and guess == min else (max + min) / 2
            pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
            if pguess <= pexact and hypergeom.pmf(guess + 1, n1 + n2, n1, n) > pexact:
                break
            elif pguess <= pexact:
                min = guess
            else:
                max = guess

        if guess == -1:
            guess = min

        while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
            guess += 1

        while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
            guess -= 1

        p = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
        if p > 1.0:
            p = 1.0
        return oddsratio, p
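
Modern scipy ships an equivalent scipy.stats.fisher_exact, which makes a convenient cross-check for the implementation above; this reproduces the docstring example:

from scipy.stats import fisher_exact as scipy_fisher

oddsratio, p = scipy_fisher([[100, 2], [1000, 5]])
# oddsratio == 0.25, p approximately 0.13007593634330314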
Example No. 53
		lc = l.strip("\n").split("\t")
		tf[lc[1]] = 1
		if lc[1] in expr:
			tf_num = tf_num + 1
		if lc[1] in degs:
			tf_degs = tf_degs + 1
	
	#print expr_num
	#print degs_num
	#print tf_num
	#print tf_degs
	
	## [M, n, N]: expressed genes, DEGs, TF-bound genes
	[express_genes, degs, tf_binding_genes] = [expr_num, degs_num, tf_num]  # note: degs is rebound from a gene set to a count here
	#rv = hypergeom(express_genes, degs, tf_binding_genes)
	#x = np.arange(0, 10)
	#pmf_tfBindDegs = rv.pmf(1)
	#print np.sum(pmf_tfBindDegs)
	#print 1-np.sum(pmf_tfBindDegs)
	# probability of observing at most tf_degs DEGs among the TF-bound genes (lower tail)
	prb = hypergeom.cdf(tf_degs, express_genes, degs, tf_binding_genes)
	#pvalue = 1 - hypergeom.cdf(tf_degs, express_genes, degs, tf_binding_genes)
	
	#if pvalue > 1:
	#	print >>sys.stderr, prefix
	
	prefix = sys.argv[4]

	print "%s\t%.12f\t%d\t%d\t%d\t%d" % (prefix, prb, express_genes, degs, tf_binding_genes, tf_degs)

def Main():
    t1=time()
    
    global min_interaction, p_value
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')
    ncpus=args.parallel


    #store genomic location of part1 and part2
    part=[]


    k=0
    
    print >> sys.stderr,"# Inputing data..."

    chr_list=[]
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        p1=annotated_bed(line[0:8],id=k,part=1)
        p2=annotated_bed(line[9:],id=k,part=2)
        if SingleFragment(p1,p2): continue
        k+=1
        part.append(p1)
        part.append(p2)
        if p1.chr not in chr_list: chr_list.append(p1.chr)
        if p2.chr not in chr_list: chr_list.append(p2.chr)
        if k%20000==0: 
            print >> sys.stderr,"  Reading %d pairs of segments\r"%(k),
    print >> sys.stderr,"Get total %d pairs."%(k)
    

    # sort in genomic order, easy for clustering
    part=sorted(part, key=attrgetter('start'))
    part=sorted(part, key=attrgetter('chr'))

    # for parallel computing 
    print >>sys.stderr,"# Generating clusters for two parts..."
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs=[]
    for chro in chr_list:
        part_temp=filter(lambda p: p.chr==chro, part)
        if len(part_temp)>0:
            jobs.append(job_server.submit(cluster_regions,(part_temp,min_clusterS),(annotated_bed,),("UnionFind","copy",)))
        

    cluster_pool={}
    part=[]
    for job in jobs: 
        try:
            part=part+job()[1]
            cluster_pool.update(job()[0])
        except:
            print >> sys.stderr, "Wrong in %s, part1"%(job()[2])
            continue


    print >>sys.stderr,"   cluster number is %d             "%(len(cluster_pool))

    # sort back to pair two parts
    part=sorted(part, key=attrgetter('part'))
    part=sorted(part, key=attrgetter('id'))

    print >> sys.stderr,"size of part",len(part)

    c_interaction={}
    i=0
    while i<len(part):
        P1=part[i]
        P2=part[i+1]
        assert P1.id==P2.id
        i+=2
        print >> sys.stderr,"%d\r"%(i),
        if P1.cluster==P2.cluster: continue
        if P1.cluster<P2.cluster:
            inter=P1.cluster+"--"+P2.cluster
        else:
            inter=P2.cluster+"--"+P1.cluster
        if c_interaction.has_key(inter):
            c_interaction[inter]+=1
        else:
            c_interaction[inter]=1

    # annotation file
    print >> sys.stderr,"# Indexing annotation files"
    dbi_all=DBI.init(args.annotation,"bed")
    dbi_detail=DBI.init(args.db_detail,"bed")
    dbi_repeat=DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt","bed")


    print >> sys.stderr,"# finding strong interactions from clusters..."
    k=0 # record for strong interactions
    n=0

    # annotation file

    for interaction in c_interaction:
        n=n+1
        count=c_interaction[interaction]
        if count<min_interaction: continue
        i=interaction.split("--")[0]
        j=interaction.split("--")[1]
        try:  # we select clusters with size no less than 5, so some interactions cannot be found in clusters
            count1=cluster_pool[i].cluster
            count2=cluster_pool[j].cluster
        except:
            continue
        real_p=1-hypergeom.cdf(count,len(part)/2,count1,count2)
        if real_p<=p_value:
            k=k+1
            cluster_pool[i].Annotate(dbi_all,dbi_detail,dbi_repeat)
            cluster_pool[j].Annotate(dbi_all,dbi_detail,dbi_repeat)
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >> output,str(cluster_pool[i])+'\t'+str(cluster_pool[j])+'\t%d\t%.4f'%(count,log_p)
        if n%1000==0: print >> sys.stderr, "  Progress ( %d / %d )\r"%(n,len(c_interaction)),

    print >> sys.stderr,"# Find %d strong interactions. Cost time: %.2f s"%(k,time()-t1)
Example No. 55
	def p_hypergeom(self,N_pop,n_chosen,K_pop,k_success):
		return hypergeom.cdf(k_success,N_pop,K_pop,n_chosen)
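
Note that hypergeom.cdf here is the lower tail P(X <= k_success), i.e. a depletion test; a hypothetical enrichment counterpart (name ours) would use the upper tail instead:

from scipy.stats import hypergeom

def p_hypergeom_enrich(N_pop, n_chosen, K_pop, k_success):
    # P(X >= k_success) when drawing n_chosen from N_pop with K_pop successes
    return hypergeom.sf(k_success - 1, N_pop, K_pop, n_chosen)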
Example No. 56
def plot_cell_enrichments(
    ds,
    f=None,
    enrichments=None,
    ax=None,
    title=None,
):
    LOGGER.info("Plotting cell type enrichments")

    if enrichments is None:
        enrichments = brs.enrichments.get_enrichments(
            list(ds.species)[0],
        )

    if f is None:
        f = {'p': .05, 'asym_fold': 1.25}

    if isinstance(f, dict):
        f = [f]

    if ax is None:
        _, ax = plt.subplots(figsize=(4, 3))

    cell_prots = {
        cell: [key for key, val in enrichments.items() if val == cell]
        for cell in brs.CELL_TYPES
    }

    display_name = {
        'Myelinating Oligodendrocytes': 'Oligodendrocytes',
    } if sum([
        'Oligo' in cell and len(cell_prots.get(cell, [])) > 0
        for cell in brs.CELL_TYPES
    ]) else {}

    vals = []

    ds = ds.filter(
        protein=set(j for i in cell_prots.values() for j in i),
        fn=lambda x: len(x['Proteins']) < 2,
    )
    hatches = [
        "",
        "//",
        "o",
        "x",
        ".",
    ]

    for cell in brs.CELL_TYPES:
        for ind, fil in enumerate(f):
            dc = ds.filter(protein=set(cell_prots[cell]))

            fore_hits = dc.filter(fil).shape[0]
            fore_size = dc.shape[0]

            back_hits = ds.filter(fil).shape[0]
            back_size = ds.shape[0]

            if fore_size < 1 or back_size < 1:
                continue

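            # P(X >= fore_hits) under the hypergeometric null;
            # 1 - cdf(x) + pmf(x) is the same as sf(x - 1)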
            val = (
                1 -
                hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) +
                hypergeom.pmf(fore_hits, back_size, back_hits, fore_size)
            )
            vals.append(
                pd.Series(
                    OrderedDict([
                        ('cell', display_name.get(cell, cell)),
                        ('fore hits', fore_hits),
                        ('fore size', fore_size),
                        ('back hits', back_hits),
                        ('back size', back_size),
                        ('p-value', val),
                        ('-log10 p-value', -np.log10(val)),
                        ('color', brs.CELL_COLORS[cell]),
                        ('hatch', hatches[ind % len(hatches)]),
                        ('hue', format_title(f=fil)),
                    ])
                )
            )

    df = pd.DataFrame(vals)

    ax = sns.barplot(
        data=df,
        y='cell',
        x='-log10 p-value',
        hue='hue',
        ax=ax,
    )

    ax.axvline(-np.log10(.01), color='k', linestyle=':')
    ax.legend(
        handles=[
            mpatches.Patch(
                facecolor='w',
                edgecolor='k',
                hatch=i,
                label=df['hue'].iloc[ind],
            )
            for ind, i in enumerate(hatches[:len(f)])
        ]
    )

    for hatch, color, p in zip(
        df['hatch'],
        df['color'],
        sorted(ax.patches, key=lambda x: x.xy[1]),
    ):
        p.set_hatch(hatch)
        p.set_facecolor(color)
        p.set_edgecolor('k')

    if title:
        ax.set_title(title)

    ax.set_ylabel('')
    ax.set_xlabel('p-value')
    ax.set_xticklabels(['{:.3}'.format(10 ** -i) for i in ax.get_xticks()])

    return ax.get_figure(), ax
Example No. 57
def Main():
    t1=time()
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')

    #store count of RNA for part1 and part2
    part1={}
    part2={}


    k=0
    
    print >> sys.stderr,"# Inputing data..."
    interaction = {}  # store number of interactions for different RNA

    Types = ["snoRNA","protein_coding","snRNA","lincRNA","tRNA","misc_RNA","pseudogene","miRNA","antisense","sense_intronic","non_coding","processed_transcript"]
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        p1=annotated_bed(line[0:8],id=k,cluster=1)
        p2=annotated_bed(line[9:],id=k,cluster=1)
        if SingleFragment(p1,p2): continue
        k+=1
        if p1.type in Types:
            p1_name = p1.chr+":"+p1.name
            if p1_name not in part1:
                part1[p1_name]=1
            else:
                part1[p1_name]+=1  
        if p2.type in Types:
            p2_name = p2.chr+":"+p2.name
            if p2_name not in part2:
                part2[p2_name]=1
            else:
                part2[p2_name]+=1
        if p1.type in Types and p2.type in Types:            
            inter_name = p1_name+"--"+p2_name
            if inter_name not in interaction:
                interaction[inter_name]=[copy.deepcopy(p1),copy.deepcopy(p2)]
            else:
                interaction[inter_name][0].Update(p1.start,p1.end)
                interaction[inter_name][1].Update(p2.start,p2.end)
                interaction[inter_name][0].cluster+=1
        if k%20000==0: 
            print >> sys.stderr,"  Reading %d pairs of segments\r"%(k),
    print >> sys.stderr,"Get total %d pairs."%(k)

    print >>sys.stderr,"   number of different RNAs for part1 is %d          "%(len(part1))
    print >>sys.stderr,"   number of different RNAs for part2 is %d          "%(len(part2))
    
    total = k # total pairs used
    n=0
    k=0  # record number of strong interactions
    for i in interaction:
        n+=1
        count = interaction[i][0].cluster
        if count < min_interaction: continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part1[p1_name]
        P2.cluster = part2[p2_name]
        if part1[p1_name]<min_clusterS or part2[p2_name]<min_clusterS: continue
        real_p=1-hypergeom.cdf(count,total,part1[p1_name],part2[p2_name])
        if real_p<=p_value:
            k=k+1
            try:
                log_p = math.log(real_p)
            except:
                log_p = -float("Inf")
            print >> output, str(P1)+'\t'+str(P2)+'\t%d\t%.4f'%(count,log_p)
        if n%100==0: print >> sys.stderr, "  Progress ( %d / %d )\r"%(n,len(interaction)),
    print >> sys.stderr,"# Find %d strong interactions. Cost time: %.2f s"%(k,time()-t1)