def valid_covalue(data, parts):
    # False-positive rate: co-occurrence tests between columns drawn from
    # *different* partitions should rarely be significant.
    count = 0
    total = 0
    for i in range(0, len(parts) - 1):
        for j in range(i + 1, len(parts)):
            for partone in range(0, len(parts[i])):
                for parttwo in range(0, len(parts[j])):
                    left_i = parts[i][partone]
                    right_i = parts[j][parttwo]
                    left = data[left_i].astype(bool)
                    right = data[right_i].astype(bool)
                    overlap = np.count_nonzero(np.bitwise_and(left, right))
                    prb = hypergeom.cdf(overlap, len(left),
                                        np.count_nonzero(left),
                                        np.count_nonzero(right))
                    if 1 - prb < 0.05:
                        count += 1
                    total += 1
    print('false positive %f' % (1.0 * count / total))
    # Rate within the *same* partition. Note: the overlap count is stored in
    # its own variable here; the original reused `k`, which clobbered the loop
    # index and made `parts[i][k]` index the wrong column after the first pass.
    count = 0
    total = 0
    for i in range(0, len(parts)):
        for k in range(0, len(parts[i])):
            for j in range(k + 1, len(parts[i])):  # skip trivial self-pairs
                left_i = parts[i][k]
                right_i = parts[i][j]
                left = data[left_i].astype(bool)
                right = data[right_i].astype(bool)
                overlap = np.count_nonzero(np.bitwise_and(left, right))
                prb = hypergeom.cdf(overlap, len(left),
                                    np.count_nonzero(left),
                                    np.count_nonzero(right))
                if 1 - prb > 0.05:
                    count += 1
                total += 1
    print('true positive %f' % (1.0 * count / total))
def get_dic_pvalue(dic_fasta, dic_control, ctrl_all, fasta_all):
    """
    Calculate the p-value, with a hypergeometric test, of the count of the
    hexanucleotide, di-nucleotide, codon, codon-position nucleotides, amino
    acids, or others in a control set of exons and in the exons of the fasta
    file.

    :param dic_fasta: (dictionary of int) the count of the hexanucleotide,
        di-nucleotide, codon, codon-position nucleotides, amino acids, or
        others in the fasta file given by the user
    :param dic_control: (dictionary of int) the count of the hexanucleotide,
        di-nucleotide, codon, codon-position nucleotides, amino acids, or
        others in the control exons (exons CCE/ALL/ACE of fasterDB)
    :param ctrl_all: (int) the total number of the hexanucleotide,
        di-nucleotide, codon, codon-position nucleotides, amino acids, or
        others in the control set of exons
    :param fasta_all: (int) the total number of the hexanucleotide,
        di-nucleotide, codon, codon-position nucleotides, amino acids, or
        others in the fasta file
    :return: (dictionary of float) for each element gives its calculated p-value
    """
    p_val = {}
    for key in dic_fasta.keys():
        p1 = hypergeom.cdf(dic_fasta[key], ctrl_all, dic_control[key], fasta_all)
        p2 = 1 - hypergeom.cdf(dic_fasta[key], ctrl_all, dic_control[key], fasta_all)
        if p1 < p2:
            p_val[key] = p1
        else:
            p_val[key] = p2
    return p_val
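# A minimal usage sketch for get_dic_pvalue above. The dictionaries and totals
# below are made-up toy numbers, not real exon counts:
from scipy.stats import hypergeom

dic_fasta = {'AAA': 12, 'AAC': 3}
dic_control = {'AAA': 500, 'AAC': 450}
p_vals = get_dic_pvalue(dic_fasta, dic_control, ctrl_all=10000, fasta_all=100)
# Each value is min(P[X <= count], 1 - P[X <= count]) under
# X ~ Hypergeometric(M=ctrl_all, n=dic_control[key], N=fasta_all).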
def test_hypergeometric(): """Clopper-Pearson.""" np.testing.assert_almost_equal(hypergeometric(4, 10, 5, 6, 'greater'), 1 - hypergeom.cdf(3, 10, 5, 6)) np.testing.assert_almost_equal(hypergeometric(4, 10, 5, 6, 'less'), hypergeom.cdf(4, 10, 5, 6)) np.testing.assert_almost_equal(hypergeometric(4, 10, 5, 6, 'two-sided'), 2 * (1 - hypergeom.cdf(3, 10, 5, 6)))
def hypergeom_conf_interval(n, x, N, cl=0.975, alternative="two-sided", G=None,
                            **kwargs):
    """
    Confidence interval for a hypergeometric distribution parameter G, the
    number of good objects in a population of size N, based on the number x
    of good objects in a simple random sample of size n.

    Parameters
    ----------
    n : int
        The number of draws without replacement.
    x : int
        The number of "good" objects in the sample.
    N : int
        The number of objects in the population.
    cl : float in (0, 1)
        The desired confidence level.
    alternative : {"two-sided", "lower", "upper"}
        Indicates the alternative hypothesis.
    G : int in [0, N]
        Starting point in search for confidence bounds for the hypergeometric
        parameter G.
    kwargs : dict
        Keyword arguments passed through to brentq (xtol, rtol, maxiter).

    Returns
    -------
    tuple
        lower and upper confidence bounds with coverage (at least) 1 - alpha.
    """
    assert alternative in ("two-sided", "lower", "upper")
    if G is None:
        G = (x / n) * N
    ci_low = 0
    ci_upp = N
    if alternative == 'two-sided':
        cl = 1 - (1 - cl) / 2
    if alternative != "upper" and x > 0:
        f = lambda q: cl - hypergeom.cdf(x - 1, N, q, n)
        ci_low = math.ceil(brentq(f, 0.0, G, **kwargs))  # **kwargs, not *kwargs
    if alternative != "lower" and x < n:
        f = lambda q: hypergeom.cdf(x, N, q, n) - (1 - cl)
        ci_upp = math.floor(brentq(f, G, N, **kwargs))
    return ci_low, ci_upp
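# Usage sketch for hypergeom_conf_interval above; the numbers are illustrative:
# 20 draws from a population of 100 yielded 5 good objects.
import math
from scipy.optimize import brentq
from scipy.stats import hypergeom

lo, hi = hypergeom_conf_interval(20, 5, 100, cl=0.95, alternative="two-sided")
# lo and hi bracket the number of good objects G in the population with
# roughly 95% coverage (conservative, since G is integer-valued).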
def score(self, x_t: Union[np.ndarray, Any]) -> np.ndarray:
    """
    Compute the test-statistic (FET) between the reference window(s) and test
    window. If a given test-window is not yet full then a test-statistic of
    np.nan is returned for that window.

    Parameters
    ----------
    x_t
        A single instance.

    Returns
    -------
    Estimated FET test statistics (1-p_val) between reference window and test
    windows.
    """
    values = set(np.unique(x_t))
    if not set(values).issubset(['0', '1', True, False]):
        raise ValueError(
            "The `x_t` data must consist of only (0,1)'s or (False,True)'s "
            "for the FETDriftOnline detector.")
    x_t = super()._preprocess_xt(x_t)
    self._update_state(x_t)

    stats = np.zeros((len(self.window_sizes), self.n_features), dtype=np.float32)
    for k, ws in enumerate(self.window_sizes):
        if self.t >= ws:
            sum_last_ws = np.sum(self.xs[-ws:, :], axis=0)

            # Perform FET with hypergeom.cdf (this is vectorised over features)
            if self.alternative == 'greater':
                p_vals = hypergeom.cdf(self.sum_ref, self.n + ws,
                                       self.sum_ref + sum_last_ws, self.n)
            else:
                p_vals = hypergeom.cdf(sum_last_ws, self.n + ws,
                                       self.sum_ref + sum_last_ws, ws)

            # Compute test stat and apply smoothing
            stats_k = 1 - p_vals
            for f in range(self.n_features):
                if len(self.test_stats) != 0 and not np.isnan(self.test_stats[-1, k, f]):
                    stats_k[f] = ((1 - self.lam) * self.test_stats[-1, k, f]
                                  + self.lam * stats_k[f])
            stats[k, :] = stats_k
        else:
            stats[k, :] = np.nan
    return stats
def enrichment(geneList1, geneList2, allGenes):
    tmp = set(geneList1).intersection(geneList2)
    x = len(tmp)        # overlap between the two lists
    M = allGenes        # population size
    n = len(geneList1)
    N = len(geneList2)
    return [x, tmp, M, n, N, 1 - hypergeom.cdf(x, M, n, N)]
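# Toy call of enrichment() above (illustrative gene names): 3 of the 5 genes
# in list 1 also appear in list 2, out of a background of 1000 genes.
from scipy.stats import hypergeom

x, overlap, M, n, N, p = enrichment(['a', 'b', 'c', 'd', 'e'],
                                    ['c', 'd', 'e', 'f'], 1000)
# p = 1 - P[X <= x] with X ~ Hypergeom(M, n, N), i.e. P[X > x]; callers
# wanting the inclusive tail P[X >= x] typically pass x - 1 to the cdf,
# as several snippets below do.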
def _TEST_(block, exptotal, bkgtotal, pseudo_count):
    sigPoi = 0
    sigHyp = 0
    for i in block.index:
        lamda = block.loc[i, 'BkgHop_withBonus']
        obs = block.loc[i, 'ExpHop_withBonus']
        BkgHop = block.loc[i, 'BkgHop']
        ExpHop = block.loc[i, 'ExpHop']
        P_poisson = 1 - poisson.cdf(int(obs) - 1, lamda + pseudo_count)
        if int(BkgHop) < 100000000 and int(ExpHop) < 100000000:
            P_hyper = 1 - hypergeom.cdf(ExpHop - 1, (bkgtotal + exptotal),
                                        exptotal, (ExpHop + BkgHop))
        else:
            P_hyper = '***'
        block.loc[i, 'BkgFraction'] = float(BkgHop / bkgtotal)
        block.loc[i, 'ExpFraction'] = float(ExpHop / exptotal)
        block.loc[i, 'P_Hyper'] = P_hyper
        block.loc[i, 'P_Poisson'] = P_poisson
        if P_poisson < alpha and lamda * obs != 0:
            sigPoi = sigPoi + 1
        if P_hyper != '***' and P_hyper < alpha:  # skip the sentinel string
            sigHyp = sigHyp + 1
    return block, sigPoi, sigHyp
def hypergeom_test(data, sort_fdr=True):
    p_value_list = []
    ratio_in_study_list = []
    ratio_in_pop_list = []
    classes = []
    hit_genes = []
    hit_links = []
    path_names, typeIIs, typeIs = list(), list(), list()
    for (study_hitnumber, pop_number, pop_hitnumber, study_number, each_class,
         associated_diff_info, link, path_name, typeII, typeI) in data:
        p_value = 1 - hypergeom.cdf(study_hitnumber - 1, pop_number,
                                    pop_hitnumber, study_number)
        ratio_in_study = str(study_hitnumber) + '/' + str(study_number)
        ratio_in_pop = str(pop_hitnumber) + '/' + str(pop_number)
        p_value_list.append(p_value)
        ratio_in_study_list.append(ratio_in_study)
        ratio_in_pop_list.append(ratio_in_pop)
        classes.append(each_class)
        hit_genes.append(associated_diff_info)
        hit_links.append(link)
        path_names.append(path_name)
        typeIIs.append(typeII)
        typeIs.append(typeI)
    q_value_list = multipletests(p_value_list, method='fdr_bh')[1]
    number = len(q_value_list)
    databases = ['KEGG PATHWAY'] * number
    result = zip(path_names, databases, classes, ratio_in_study_list,
                 ratio_in_pop_list, p_value_list, q_value_list, hit_genes,
                 hit_links, typeIIs, typeIs)
    if sort_fdr:
        sorted_result = sorted(result, key=lambda x: (x[6], x[5]))
    else:
        sorted_result = sorted(result, key=lambda x: (x[5], x[6]))
    return sorted_result
def co_test_single(self, i, j):
    left = self.data[i].astype(bool)
    right = self.data[j].astype(bool)
    k = np.count_nonzero(np.bitwise_and(left, right))
    prb = hypergeom.cdf(k, len(left), np.count_nonzero(left),
                        np.count_nonzero(right))
    return 1 - prb
def test_hypergeom_cdf_lower(self):
    # check hypergeom cdf to return right value
    from scipy.stats import hypergeom
    h = HyperCI(n_pop=100, n_draw=20, k_s_obs=5)
    k_s = 30
    res = hypergeom.cdf(h.k_s_obs, h.n_pop, k_s, h.n_draw)
    self.assertEqual(res, 0.4009887932548518)
def run_analysis(self, pvalue_cutoff: float = 0.05, method: str = "hyperg",
                 limiter: int = 0) -> pd.DataFrame:
    results = []
    population = self.pathway_data["population"]
    num_cpds = len(self.compound_list)
    for pathway in self.pathway_data["pathways"]:
        pth_info = self.pathway_data["pathways"][pathway]
        pth_name = pth_info["name"]
        dbids = list(pth_info["compounds"].keys())
        pth_cpds = [pth_info["compounds"][x] for x in dbids]
        num_dbids = len(dbids)
        if num_dbids >= limiter:
            pathway_hits = self._check_if_in_pathway(pth_cpds)
            num_hits = len(pathway_hits)
            if method == "hyperg":
                p_value = 1 - hypergeom.cdf(num_hits - 1, population - num_dbids,
                                            num_cpds, num_dbids)
            else:
                _, p_value = fisher_exact(
                    [[num_hits, num_cpds - num_hits],
                     [num_dbids - num_hits,
                      ((population - num_dbids) - num_cpds) + num_hits]])
            in_pathway_str = self._generate_string(pathway_hits, dbids)
            importance = self._calc_cov(num_hits, num_dbids)
            results.append([pathway, pth_name, num_hits, num_dbids, p_value,
                            importance, in_pathway_str])
    results = pd.DataFrame(results, columns=[
        "Pathway ID", "Pathway Name", "Hits", "Pathway Compounds", "p",
        "Coverage", "Identifiers"
    ])
    results.set_index("Pathway ID", inplace=True)
    _, holm_p, _, _ = multipletests(results["p"].values, method="holm")
    results.insert(4, "Holm p", holm_p)
    results = results[results["Holm p"] <= pvalue_cutoff]
    results.sort_values("p", inplace=True)
    self.results = results
    return results  # the annotated return type implies the DataFrame is returned
def Random_strongInteraction(part1, part2, cluster_pool1, cluster_pool2):
    global min_interaction, p_value
    '''This is for computing FDR using random permutation.'''
    c_interaction = {}
    for i in range(len(part1)):
        region1 = str(part1[i])
        region2 = str(part2[i])
        inter = "%d--%d" % (part1[i].cluster, part2[i].cluster)
        if inter in c_interaction:
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 0
    k = 0  # record for strong interactions
    n = 0
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction:
            continue
        i = int(interaction.split("--")[0])
        j = int(interaction.split("--")[1])
        try:
            # we select clusters with size no less than 5, so some
            # interactions cannot be found in clusters
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except KeyError:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
    return [n, k]
def hyperConfidence(db, antc, consq):
    total = db.shape[0]
    cxy = raMetricas.abSupp(db, antc, consq)
    cx = raMetricas.abSupp(db, antc)
    cy = raMetricas.abSupp(db, consq)
    result = hypergeom.cdf(k=cxy - 1, M=total, n=cy, N=cx)
    return result
def calc_hyp(node_list, cui_to_genes, N, Q):
    n = len(node_list)
    (assoc_count, assoc_genes) = get_assoc(node_list)
    assoc_analy = []
    for (a, k) in assoc_count.items():
        K = len(cui_to_genes[a])
        prb = 1 - hypergeom.cdf(k, N, K, n)
        assoc_analy.append([a, k, K, prb])
    # Q = 0.001
    sort_assoc = sorted(assoc_analy, key=lambda x: (x[3], x[0]))
    m = len(sort_assoc)
    mhc_assoc = []
    for (i, [a, k, K, prb]) in enumerate(sort_assoc):
        BH = (float(i + 1) / m) * Q  # Benjamini-Hochberg threshold from ranked data
        mhc_assoc.append([i + 1, a, k, K, prb, BH])
    sig_assoc = []
    for [rank, phen, assnet, assint, prb, BH] in mhc_assoc:
        if prb < BH and assint > 24:
            genes = sorted(assoc_genes[phen])
            gene_str = ','.join(genes)
            phen_term = cui_to_phens[phen][0]  # use the first phenotype as the descriptor
            sig_assoc.append([rank, phen_term, phen, assnet, assint, prb, BH, gene_str])
        elif prb > BH:
            break
    return sig_assoc
def zeta(L, s, Nmutated, a):
    # zeta(L,s,Nmut,a) = P[H(L+Nmut,L-Nmut,s) >= a]
    # i.e. the tail of a hypergeometric distribution
    # in the manuscript, zeta(L,s,n,a) is F_n(a) for a given L and s
    #
    # see
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.hypergeom.html#scipy.stats.hypergeom
    #
    # hypergeom.cdf(k,M,n,N)
    #   k = number of red balls drawn = a-1 (not a)
    #   M = total number of balls     = L+Nmutated
    #   n = number of red balls       = L-Nmutated
    #   N = number of draws           = s
    #if (showZetaCalls):
    #    callStr = "zeta(%s,%s,%s,%s) = 1-hypergeom.cdf(%s,%s,%s,%s) = %.12f" \
    #            % (L,s,Nmutated,a,
    #               a-1,L+Nmutated,L-Nmutated,s,
    #               1 - hypergeom.cdf(a-1,L+Nmutated,L-Nmutated,s))
    #    cacheKey = (L,s,Nmutated,a)
    #    if (cacheKey in zeta_cache): callStr += " (from cache)"
    #    print(callStr,file=stderr)
    if (useCache):
        cacheKey = (L, s, Nmutated, a)
        if (cacheKey in zeta_cache):
            return zeta_cache[cacheKey]
    p = 1 - hypergeom.cdf(a - 1, L + Nmutated, L - Nmutated, s)
    if (useCache):
        zeta_cache[cacheKey] = p
    return p
def pval(R, n_basepairs, q_size):
    t = R[:, 0]
    pvals = np.zeros(len(t))
    for i in range(0, len(t)):
        pvals[i] = hypergeom.cdf(t[i], n_basepairs, q_size, R[i, 2])
    return 1 - pvals
def specificities(freqMotifParDep):
    """
    Compute the specificity of the pattern in each département.
    - input: dataframe containing the frequency of the searched pattern per département
    - output: dictionary giving, for each département, the specificity of the pattern
    """
    freqTot = 31868064
    freqTotParDep = pd.read_hdf('./static/freqByDep.hdf', 'freqTokensByDep')
    freqTotMotif = freqMotifParDep.sum().sum()
    df_freqTotMotif = pd.DataFrame(freqMotifParDep.sum(axis=1), columns=["0"])
    # Expected frequency of the pattern in each département
    expectedCounts = df_freqTotMotif.dot(freqTotParDep) / freqTot
    specif = freqMotifParDep.copy()
    # For each département, the specificity of the pattern is computed from:
    # - the frequency of the pattern in that département (from freqMotifParDep)
    # - the total frequency of all tokens (freqTot)
    # - the total frequency of the pattern (freqTotMotif)
    # - the total frequency of all tokens in the département (from freqTotParDep)
    for dep in freqMotifParDep.columns:
        if (freqMotifParDep.loc["freq", dep] < expectedCounts.loc["freq", dep]):
            specif.loc["freq", dep] = hypergeom.cdf(
                freqMotifParDep.loc["freq", dep], freqTot, freqTotMotif,
                freqTotParDep.transpose().loc[dep])
        else:
            specif.loc["freq", dep] = 1 - hypergeom.cdf(
                freqMotifParDep.loc["freq", dep] - 1, freqTot, freqTotMotif,
                freqTotParDep.transpose().loc[dep])
    specif = np.log10(specif)
    specif[freqMotifParDep >= expectedCounts] = -specif[freqMotifParDep >= expectedCounts]
    # Values outside [-10, 10] are clipped
    for dep in specif:
        specif.loc[specif[dep] > 10, dep] = 10
        specif.loc[specif[dep] < -10, dep] = -10
    specif.rename(index={"freq": "specif"}, inplace=True)
    specif = pd.DataFrame.to_dict(specif)
    return specif
def cHgPvl(x, M, n, N):
    """
    x = randVar
    M = popSize
    n = totalSuccesses
    N = samplSize
    """
    return 1 - hypergeom.cdf(x, M, n, N) + hypergeom.pmf(x, M, n, N)
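# Side note on the idiom above: 1 - cdf(x) + pmf(x) is the inclusive upper
# tail P[X >= x], which scipy also exposes directly as the survival function
# evaluated at x - 1. The same trick reappears in _motif_sig further down.
# A quick sanity check with arbitrary toy parameters:
from scipy.stats import hypergeom

x, M, n, N = 4, 50, 10, 20
lhs = 1 - hypergeom.cdf(x, M, n, N) + hypergeom.pmf(x, M, n, N)
rhs = hypergeom.sf(x - 1, M, n, N)
assert abs(lhs - rhs) < 1e-9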
def _hypergeom_wrapper(self, x):
    from scipy.stats import hypergeom
    p = hypergeom.cdf(x['lonely triplets at pos'], x['Num Triplets at Gene'],
                      x['lonely triplets at gen'], x['Num Triplets at Pos'])
    return p
def compHyperGemP(hits, bg_all, bg_hits, query_size):
    '''Returns P(X>=occ); fold-enrichment'''
    p = 1 - hypergeom.cdf(hits - 1, bg_all, bg_hits, query_size)  # P(X>=occ)
    fe = hits / ((bg_hits / bg_all) * query_size)
    return p, fe
def GO_enrichment(geneList, ontology, expressedGenes=None, printIt=False,
                  pCut=1000000, xRef={}):
    lenAllGenes, lenTheseGenes = len(expressedGenes), len(geneList)
    pValues = defaultdict()
    nCmps = 0
    for GOTerm, GOGenes in ontology.items():
        inBoth = GOGenes['genes'].intersection(geneList)
        expressedGOGenes = GOGenes['genes'].intersection(expressedGenes)
        if len(inBoth) <= 3 or len(expressedGOGenes) < 5:
            pValues[GOTerm] = 'notest'
            continue
        nCmps += 1  # count the comparisons actually performed
        pVal = 1. - hypergeom.cdf(len(inBoth), lenAllGenes,
                                  len(expressedGOGenes), lenTheseGenes)
        if pVal < 0:
            pVal = 0
        symbols = []
        for ensg in inBoth:
            if ensg in xRef:
                symbols.append(xRef[ensg])
            else:
                symbols.append(ensg)
        # store as a list so the p-value can be updated in place below
        pValues[GOTerm] = [pVal, len(inBoth), len(expressedGOGenes),
                           len(GOGenes['genes']), inBoth, symbols]
    for k, v in pValues.items():
        if v == 'notest':
            continue
        pValues[k][0] = v[0] * float(nCmps)  # Bonferroni correction
    import operator
    y = []
    sorted_x = sorted(((k, v) for k, v in pValues.items() if v != 'notest'),
                      key=operator.itemgetter(1))
    for k, v in sorted_x:
        if not isinstance(k, str):
            continue
        if v[0] > pCut:
            continue
        if printIt:
            print(k, "|".join(ontology[k]['name']), "%.3e" % v[0], v[1], v[2], v[3])
        y.append([k, "|".join(ontology[k]['name']), v[0], v[1], v[2], v[3],
                  ",".join(v[4]), ",".join(v[5])])
    columns = ['GO Term ID', 'GO Term Description',
               'Bonferroni-corrected Hypergeometric p-Value',
               'N Genes in List and GO Category',
               'N Expressed Genes in GO Category', 'N Genes in GO category',
               'Ensembl Gene IDs in List', 'Gene symbols in List']
    try:
        df = pd.DataFrame(y, columns=columns)
        df.set_index('GO Term ID', inplace=True)
    except Exception:
        df = pd.DataFrame(None, columns=columns)
    return df
def get_pa(N, n, c, p):
    """
    Get the probability of acceptance
    N = lot size
    n = sample size
    c = tolerable defective rate
    p = defective rate
    """
    return hypergeom.cdf(c, N, N * p, n)
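# Acceptance-sampling sketch for get_pa above (illustrative numbers): a lot of
# 500 units with a 2% defect rate, sampling 50 units and accepting up to c = 2
# defectives. Note that N * p must come out integral for hypergeom.
from scipy.stats import hypergeom

pa = get_pa(N=500, n=50, c=2, p=0.02)  # P[at most 2 defectives in the sample]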
def summary_to_hypergeometric_pvals(summary):
    hypergeometric_pvals = {}
    for dbk in dbs.keys():
        for p in paddings:
            # reduce padding-specific tuples in summary[k][p] to find TF
            # indices where there are overlaps
            tf_overlaps = {}
            ordered_tfs = summary[dbk]['ordered_tfs']
            mapped_tfs = summary[dbk]['mapped_tfs']
            totals = summary[dbk]['totals']
            tups = summary[dbk][str(p)]
            for tup in tups:
                (probe, bitstring) = tup
                bits = list(bitstring)
                for idx, bit in enumerate(bits):
                    if bit == '1':
                        if ordered_tfs[idx] not in tf_overlaps:
                            tf_overlaps[ordered_tfs[idx]] = 0
                        tf_overlaps[ordered_tfs[idx]] += 1
            # use these indices to count TF-overlap totals over the set of
            # probes, and then over all probes via summary[k]['totals']
            all_overlaps = {}
            for tfk in tf_overlaps.keys():
                idx = mapped_tfs[tfk]
                all_overlaps[tfk] = totals[idx]
            # for each TF, calculate a hypergeometric p-value
            for tfk in tf_overlaps.keys():
                # number of probes ("balls") in probe-subset ("sample") that
                # overlap TF (sample of "red balls")
                x = int(tf_overlaps[tfk])
                # number of probes ("balls") in set ("urn") ("all balls, black and red")
                M = int(total_number_of_probes)
                # number of probes in set ("urn") that overlap TF (all "red balls")
                n = int(all_overlaps[tfk])
                # number of probes in probe-subset ("sample")
                N = int(len(probes))
                pval = 1.0 - hypergeom.cdf(x, M, n, N)
                if pval == 0.0:
                    pval = 2.2250738585072014e-308  # smallest positive normal double
                ip = str(p)
                if dbk not in hypergeometric_pvals:
                    hypergeometric_pvals[dbk] = {}
                if ip not in hypergeometric_pvals[dbk]:
                    hypergeometric_pvals[dbk][ip] = []
                hypergeometric_pvals[dbk][ip].append({'id': tfk, 'score': pval})
    return hypergeometric_pvals
def diff_of_tail_area_and_cl(self, k_s_x, lf='left'):
    """
    Calculate left/right tail probability minus (confidence level)*0.5 for
    hypergeometric distribution.

    :param k_s_x: number of successes in the population
    :param lf: left tail or right tail
    :return: when lf='left' it returns the sum of density over [0:k_s_t+1]
        minus (confidence level)*0.5; when lf='right' it returns the sum of
        density over [k_s_t:n_pop+1] minus (confidence level)*0.5
    """
    if lf == 'left':
        # compute the left tail
        return abs(hypergeom.cdf(self.k_s_obs, self.n_pop, k_s_x, self.n_draw)
                   - self.cl * 0.5)
    elif lf == 'right':
        # compute the right tail
        return abs(1 - hypergeom.cdf(self.k_s_obs - 1, self.n_pop, k_s_x, self.n_draw)
                   - self.cl * 0.5)
    else:
        raise TypeError('lf must be "left" or "right"')
def hypergeom_function(white_balls_drawn, population, white_balls_in_population,
                       total_balls_drawn):
    """
    hypergeometric function for probability value
    @param white_balls_drawn -- associated genes in the input gene subset
    @param population -- population (here the official Homo sapiens genes in NCBI)
    @param white_balls_in_population -- associated genes in the population
    @param total_balls_drawn -- input gene list size
    """
    prob = 1 - hypergeom.cdf(white_balls_drawn - 1, population,
                             white_balls_in_population, total_balls_drawn)
    return prob
def calculate_enrichment(genes_file, peaks_file, clusters, n_genes, working_dir,
                         distance=None, report_entire_feature=False,
                         bedtools_exe="bedtools"):
    """
    Calculate enrichment for a single peak set and distance

    genes_file (str): path to BED file with all genes
    distance (int): distance to calculate enrichments at
    peaks_file (list): BED file containing the ChIP-seq peaks
    clusters (list): cluster files
    n_genes (int): total number of genes in the genes BED file
    report_entire_feature (bool): if True then run intersectBed with the -wa
        option (to report the entire feature, not just the overlap)
    bedtools_exe (str): 'bedtools' executable to use

    Returns tuple (pvalues, counts), i.e. col1 for p-values, col2 for number
    of genes.
    """
    # Initialise result arrays
    pvalues = np.zeros([len(clusters)])
    counts = np.zeros([len(clusters)])
    # Get set of genes overlapping this peak set for this distance
    overlap_genome = get_overlapping_genes(
        genes_file, peaks_file, distance, working_dir=working_dir,
        report_entire_feature=report_entire_feature)
    # Find subsets of overlapping genes in each RNA-seq cluster
    # and calculate enrichments
    for i, cluster_file in enumerate(clusters):
        # Read cluster file (dtype=str; the deprecated np.str alias is gone)
        genes_cls = set(np.loadtxt(cluster_file, delimiter='\t', ndmin=1,
                                   usecols=[0], dtype=str))
        # No. of genes in current cluster (sample size)
        n = len(genes_cls)
        # Total number of overlapping genes (for set of all genes)
        K_i = len(overlap_genome)
        # Genes from the input regions based set, which are also in this cluster
        n_i = len(overlap_genome.intersection(genes_cls))
        # Calculate and store enrichment from hypergeometric function
        pvalues[i] = max(MIN_PVALUE, 1.0 - hg.cdf(n_i - 1, n_genes, n, K_i))
        counts[i] = n_i
    return (pvalues, counts)
def _one_fit(self):
    print("\nCreating downsampled doublets...")
    self._createDoublets()

    # Normalize combined augmented set
    print("Normalizing...")
    aug_counts = self.normalizer(
        np.append(self._raw_counts, self._raw_synthetics, axis=0))
    self._norm_counts = aug_counts[:self._num_cells]
    self._synthetics = aug_counts[self._num_cells:]

    print("Running PCA...")
    # Get phenograph results
    pca = PCA(n_components=self.n_components)
    print("Clustering augmented data set with Phenograph...\n")
    reduced_counts = pca.fit_transform(aug_counts)
    fullcommunities, _, _ = phenograph.cluster(reduced_counts,
                                               **self.phenograph_parameters)
    min_ID = min(fullcommunities)
    self.communities_ = fullcommunities[:self._num_cells]
    self.synth_communities_ = fullcommunities[self._num_cells:]
    community_sizes = [np.count_nonzero(fullcommunities == i)
                       for i in np.unique(fullcommunities)]
    print("Found communities [{0}, ... {2}], with sizes: {1}\n".format(
        min(fullcommunities), community_sizes, max(fullcommunities)))

    # Count number of fake doublets in each community and assign score
    # Number of synth/orig cells in each cluster.
    synth_cells_per_comm = collections.Counter(self.synth_communities_)
    orig_cells_per_comm = collections.Counter(self.communities_)
    community_IDs = orig_cells_per_comm.keys()
    community_scores = {
        i: float(synth_cells_per_comm[i]) /
           (synth_cells_per_comm[i] + orig_cells_per_comm[i])
        for i in community_IDs
    }
    scores = np.array([community_scores[i] for i in self.communities_])
    community_p_values = {
        i: hypergeom.cdf(synth_cells_per_comm[i], aug_counts.shape[0],
                         self._synthetics.shape[0],
                         synth_cells_per_comm[i] + orig_cells_per_comm[i])
        for i in community_IDs
    }
    p_values = np.array([community_p_values[i] for i in self.communities_])

    if min_ID < 0:
        scores[self.communities_ == -1] = np.nan
        p_values[self.communities_ == -1] = np.nan

    return scores, p_values
def hypergeom(self, white_balls_drawn, population, white_balls_in_population,
              total_balls_drawn):
    """
    hypergeometric function for probability value
    @param white_balls_drawn -- associated genes in the input gene subset
    @param population -- population (here the official Homo sapiens genes in NCBI)
    @param white_balls_in_population -- associated genes in the population
    @param total_balls_drawn -- input gene list size
    """
    prob = 1 - hypergeom.cdf(white_balls_drawn - 1, population,
                             white_balls_in_population, total_balls_drawn)
    return prob
def specificities(lexicalTable, annotationType):
    from scipy.stats import hypergeom
    M = lexicalTable.sum().sum()
    lengths = pd.DataFrame(lexicalTable.sum())
    freq = pd.DataFrame(lexicalTable.sum(axis=1))
    expectedCounts = (freq.dot(lengths.transpose())) / M
    specif = lexicalTable.copy()
    for part in lexicalTable.columns:
        sys.stdout.write("\r5/6 - " + annotationType +
                         " - computing specificities for département " + str(part))
        for word in lexicalTable.index:
            if lexicalTable.loc[word, part] < expectedCounts.loc[word, part]:
                specif.loc[word, part] = hypergeom.cdf(
                    lexicalTable.loc[word, part], M, freq.loc[word], lengths.loc[part])
            else:
                specif.loc[word, part] = 1 - hypergeom.cdf(
                    lexicalTable.loc[word, part] - 1, M, freq.loc[word], lengths.loc[part])
    specif = np.log10(specif)
    specif[lexicalTable >= expectedCounts] = -specif[lexicalTable >= expectedCounts]
    sys.stdout.write("\n")
    # clip values to [-10, 10]
    for dep in specif:
        specif.loc[specif[dep] > 10, dep] = 10
        specif.loc[specif[dep] < -10, dep] = -10
    return specif
def accumulative_hypergeometric(k, n, K, N):
    '''
    [k]: SUCCESS IN THE CLUSTER
    [n]: SIZE OF THE CLUSTER
    [K]: SUCCESS IN POPULATION
    [N]: SIZE OF THE POPULATION
    '''
    k, n, K, N = int(k), int(n), int(K), int(N)
    sf = hyp.sf(k, N, K, n)
    if sf < 1:
        return sf + hyp.pmf(k, N, K, n)
    else:
        return 1 - hyp.cdf(k, N, K, n) + hyp.pmf(k, N, K, n)
def hypergeometric(objects_in_bin, total_size, objects_total, bin_size):
    p_over = np.log10(
        hypergeom.sf(objects_in_bin - 1, total_size, objects_total, bin_size))
    p_under = np.log10(
        hypergeom.cdf(objects_in_bin, total_size, objects_total, bin_size))
    if p_over < p_under:
        p = -p_over
    else:
        p = p_under
    if abs(p) > 3:
        return p / abs(p) * 3  # clamp the signed log10 p-value to +/-3
    else:
        return p
def plotheatmap_hypertest(ann_heatmap, genes, groupby, filename, vmin=0,
                          vmax=1.5, figsize=(23, 2)):
    from scipy.stats import hypergeom
    ann_heatmap = ann_heatmap.copy()
    ann_heatmap.X = ann_heatmap.raw.X
    ann_heatmap = ann_heatmap[:, ann_heatmap.var_names.isin(genes)].copy()
    ann_heatmap.var['ncells'] = (ann_heatmap.X > 0).sum(axis=0).tolist()[0]
    records = {}
    M = len(ann_heatmap.obs)
    for grp in ann_heatmap.obs[groupby].unique():
        test_group = ann_heatmap[ann_heatmap.obs[groupby] == grp].copy()
        test_group.var['ncells'] = (test_group.X > 0).sum(axis=0).tolist()[0]
        N = len(test_group.obs)
        records[grp] = []
        for g in genes:
            n = ann_heatmap.var.loc[g]['ncells']
            x = test_group.var.loc[g]['ncells']
            # add a small value to avoid 0 as a p-value
            hyper_test_pval = 1 - hypergeom.cdf(x, M, n, N) + 10e-30
            records[grp].append(hyper_test_pval)
    htmap_df = pd.DataFrame.from_dict(records, orient='index', columns=genes)
    htmap_df = htmap_df.sort_index()
    f, ax = plt.subplots(1, 1, figsize=figsize)
    sns.heatmap(
        htmap_df.applymap(lambda x: -np.log10(x)),
        square=False,
        cmap=sns.light_palette('red', as_cmap=True),
        vmin=vmin,
        vmax=vmax,
        linewidths=.5,
        cbar_kws=dict(use_gridspec=False, aspect=8, label='$-log(p_{val})$',
                      anchor=(-0.3, 0.0)),
        ax=ax,
    )
    f.savefig(filename + '.pdf', bbox_inches='tight', dpi=300)
    plt.close(f)
    return htmap_df
def compare_cols(fg_col, fg_cons, fg_size, fg_weights,
                 bg_col, bg_cons, bg_size, bg_weights,
                 aa_freqs, pseudo_size):
    "Compare alignments using the hypergeometric model"
    # Number of consensus-type residues in the foreground column
    fg_cons_count = count_col(fg_col, fg_weights)[fg_cons]
    # Consensus residue frequency in the combined alignment column
    p_j = count_col(bg_col, bg_weights)[fg_cons] + fg_cons_count
    # Round fg counts & size to nearest integer for hypergeometric test
    fg_cons_count_i = max(1, int(ceil(fg_cons_count)))
    fg_size_i = int(ceil(fg_size))
    bg_size_i = int(ceil(bg_size))
    # Probability of fg col conservation vs. the combined/main set
    pvalue = 1.0 - hypergeom.cdf(fg_cons_count_i - 1, fg_size_i + bg_size_i,
                                 p_j, fg_size_i)
    return pvalue
def doHyperG(genelist, allgenes, allterms, assocname):
    geneswithterms = allgenes.keys()
    termswithgenes = allterms.keys()
    M = len(geneswithterms)
    N = len(set(geneswithterms).intersection(set(genelist)))
    pvalues = []
    termsingenelist = []
    termsinbackground = []
    termname = []
    for t in termswithgenes:
        n = len(allterms[t])
        x = len(set(allterms[t]).intersection(set(genelist)))
        if x == 0:
            continue
        pvalue = 1.0 - hypergeom.cdf(x, M, n, N)
        pvalues.append(pvalue)
        termsingenelist.append(x)
        termsinbackground.append(n)
        termname.append(t)
    adjpvalue = list(fdrcorrection0(pvalues)[1])
    print("\t".join(["Term annotation", "pvalue", "fdr adj pvalue", "Background",
                     "Expected", "GeneList", "Observed", "Genes"]))
    for u in range(0, len(adjpvalue)):
        gotermname = termname[u]
        if termname[u] in assocname:
            gotermname = assocname[termname[u]]
        print("\t".join([gotermname, str(pvalues[u]), str(adjpvalue[u]), str(M),
                         str(termsinbackground[u]), str(N),
                         str(termsingenelist[u]),
                         ",".join(set(allterms[termname[u]]).intersection(set(genelist)))]))
def hypergeometric_test(x, M, n, N):
    """
    The hypergeometric distribution models drawing objects from a bin.

    - M is the total number of objects
    - n is the total number of Type I objects
    - x (random variate) represents the number of Type I objects in N drawn
      without replacement from the total population

    - http://en.wikipedia.org/wiki/Hypergeometric_distribution
    - https://www.biostars.org/p/66729/
    - http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.hypergeom.html
    - http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.hypergeometric.html
    - http://stackoverflow.com/questions/6594840/what-are-equivalents-to-rs-phyper-function-in-python
    """
    assert n <= M
    assert x <= n
    assert N <= M
    pv_le = hypergeom.cdf(x, M, n, N)     # P[X <= x]; cdf(x+1) would include one value too many
    pv_gt = hypergeom.sf(x - 1, M, n, N)  # P[X >= x]; sf is sometimes more accurate than 1-cdf
    return pv_le, pv_gt
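# Worked call of hypergeometric_test above (toy numbers): 7 Type I objects
# observed among N = 20 draws from M = 100 objects of which n = 30 are Type I.
from scipy.stats import hypergeom

pv_le, pv_gt = hypergeometric_test(7, 100, 30, 20)
# pv_le = P[X <= 7] (depletion tail), pv_gt = P[X >= 7] (enrichment tail);
# the two tails overlap at P[X = 7], so they sum to 1 + pmf(7).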
def mutual_exclusivity(array_a, array_b, verbose=1):
    """
    Performs a hypergeometric test assessing the probability that array_a and
    array_b are mutually exclusive.

    Arguments:
        array_a, array_b: boolean array - discretised values

    Returns:
        p-value: float
    """
    a, b = np.array(array_a), np.array(array_b)
    x = np.sum(np.bitwise_and(a == False, b == True)) - 1
    M = len(a)
    n = np.sum(a == False)
    N = np.sum(b == True)
    if verbose > 0:
        print('[INFO] x, M, n, N: ', x, M, n, N)
    return 1.0 - hypergeom.cdf(x, M, n, N)
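# Toy call of mutual_exclusivity above (illustrative arrays): it counts how
# often b is True where a is False, then asks whether that count is higher
# than expected by chance.
import numpy as np
from scipy.stats import hypergeom

a = np.array([True, True, False, False, True, False])
b = np.array([False, False, True, True, False, True])
p = mutual_exclusivity(a, b, verbose=0)  # small p suggests mutual exclusivity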
control.append(get_control(path))
sys.stderr.write("%d blocks were read from\t%s\n" % (len(control[-1]), path))

counts, total = count_conservation(signal, iid2seed, matches)

r_control = []
for maf_control in control:
    cc, tc = count_control(maf_control, seeds, matches)
    r_control.append((cc, tc))

for k, v in Counter(total).most_common(10):
    name = mir_lib.shortname(seed2mirs[k])
    a = [name, counts[k], v, counts[k] / float(v)]
    for cc, tc in r_control:
        rawp = 1 - hypergeom.cdf(counts[k], tc[k], cc[k], v)
        p_value = "%.2e" % rawp
        a += [cc[k], tc[k], cc[k] / (tc[k] + 0.1), p_value]
    print("\t".join([str(x) for x in a]))
def enrichment_test(the_present_set, the_absent_set, the_grouping_dict, **kwargs):
    """
    Hypergeometric testing for enrichment.

    Arguments:
        the_present_set: a list or set of items that were tested and were positive
        the_absent_set: a list or set of items that were tested for but were
            not present / significant
        the_grouping_dict: a dict of items to check for enrichment.
            key: grouping name, e.g. GO enrichment category, complex name, etc...
            value: a list or set of items in the list to test, e.g. gene or
            protein id's

    kwargs:
        filter_items: [True (default) / False] Whether to filter the items in
            the_grouping_dict and the_present_set / the_absent_set so only
            items in common are tested.
        verbose: [False (default) / True]
        method: type of mt testing to apply, default is "none", see mtcorrect()
            for more information
        direction: ["enrichment" (default), "depletion"] essentially, whether
            to look at the right or left tail

    Returns:
        test_result_dict: a dict with:
            {the_grouping: {'p': the corrected p-value,
                            'n_present': number in the grouping that were present,
                            'n_group': the total size of the group}}
    """
    from copy import deepcopy
    from scipy.stats import hypergeom

    method = test_kwarg('method', kwargs, mtcorrect_methods)
    filter_items = test_kwarg('filter_items', kwargs, [True, False])
    verbose = test_kwarg('verbose', kwargs, [False, True])
    direction = test_kwarg('direction', kwargs, ["enrichment", "depletion"])

    the_present_set = set(the_present_set)
    the_absent_set = set(the_absent_set)
    all_test_items_set = the_present_set | the_absent_set
    the_grouping_dict = deepcopy(the_grouping_dict)
    all_grouping_items_set = set([])
    the_groupings_to_test = list(the_grouping_dict.keys())
    for the_grouping in the_groupings_to_test:
        the_grouping_dict[the_grouping] = set(the_grouping_dict[the_grouping])
        if filter_items:
            the_grouping_dict[the_grouping] = (the_grouping_dict[the_grouping]
                                               & all_test_items_set)
        all_grouping_items_set.update(the_grouping_dict[the_grouping])
        if len(the_grouping_dict[the_grouping]) == 0:
            the_grouping_dict.pop(the_grouping)
    if filter_items:
        the_present_set = the_present_set & all_grouping_items_set
        the_absent_set = the_absent_set & all_grouping_items_set
        all_test_items_set = all_test_items_set & all_grouping_items_set

    test_result_dict = {}
    n_present_dict = {}
    group_size_dict = {}
    for the_grouping in the_grouping_dict.keys():
        # x: number of present items in the grouping
        # M: total number of items
        # n: total number of present items
        # N: total number of items in the grouping
        the_grouping_set = the_grouping_dict[the_grouping]
        x = len(the_present_set & the_grouping_set)
        M = len(all_test_items_set)
        n = len(the_present_set)
        N = len(the_grouping_set)
        if direction == "enrichment":
            # We want the probability that x or more than x can be chosen
            # randomly, so we must subtract 1
            if x >= 1:
                the_p = hypergeom.sf(x - 1, M, n, N, loc=0)
            else:
                the_p = 1.
        else:
            the_p = hypergeom.cdf(x, M, n, N, loc=0)
        test_result_dict[the_grouping] = the_p
        n_present_dict[the_grouping] = x
        group_size_dict[the_grouping] = N

    corrected_p_dict = mtcorrect(test_result_dict, method=method)

    final_result_dict = {}
    for the_grouping in the_grouping_dict.keys():
        final_result_dict[the_grouping] = {}
        final_result_dict[the_grouping]['n_present'] = n_present_dict[the_grouping]
        final_result_dict[the_grouping]['n_group'] = group_size_dict[the_grouping]
        final_result_dict[the_grouping]['p'] = corrected_p_dict[the_grouping]
    return final_result_dict
def Main():
    t1 = time()
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    outputIntra = open(args.output_intra, 'w')
    hasAnnotation = False
    if args.annotation:
        dbi = DBI.init(args.annotation, "bed")
        hasAnnotation = True
    else:
        dbi = False
    if args.annotation_repeat:
        dbirepeat = DBI.init(args.annotation_repeat, "bed")
        hasAnnotationRepeat = True
    else:
        dbirepeat = False
        hasAnnotationRepeat = False  # was left undefined in this branch

    # store count of RNA for part1 and part2
    part = {}
    k = 0
    sgcount = 0  # single fragment count
    print("# Inputing data...", file=sys.stderr)
    interaction = {}      # store number of interactions for different RNA
    selfinteraction = {}
    #Types = ["snoRNA","protein_coding","snRNA","lincRNA","tRNA","misc_RNA","pseudogene","miRNA","antisense","sense_intronic","non_coding","processed_transcript","sense_overlapping","rRNA_repeat","rRNA"]
    for line in inp.read().split('\n'):
        if line == '':
            continue
        line = line.strip().split('\t')
        p1 = annotated_bed_proper(line[0:10], id=k, cluster=1)
        p2 = annotated_bed_proper(line[11:], id=k, cluster=1)
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        if SingleFragment(p1, p2):
            sgcount += 1
            continue
        k += 1
        try:
            p1_name = GetAnnotationName(p1, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat)
            if p1_name not in part:
                part[p1_name] = 1
            else:
                part[p1_name] += 1
            p2_name = GetAnnotationName(p2, hasAnnotation, dbi, hasAnnotationRepeat, dbirepeat)
            if not p1_name == p2_name:  # count once for self-interaction
                if p2_name not in part:
                    part[p2_name] = 1
                else:
                    part[p2_name] += 1
            if p1_name == p2_name:
                if p1_name not in selfinteraction:
                    selfinteraction[p1_name] = copy.deepcopy(p1)
                else:
                    selfinteraction[p1_name].Update(p1.start, p1.end)
                    selfinteraction[p1_name].Update(p2.start, p2.end)
                    selfinteraction[p1_name].cluster += 1
            else:
                if p1_name > p2_name:
                    p1, p2 = p2, p1
                    p1_name, p2_name = p2_name, p1_name
                inter_name = p1_name + "--" + p2_name
                if inter_name not in interaction:
                    interaction[inter_name] = [copy.deepcopy(p1), copy.deepcopy(p2)]
                else:
                    interaction[inter_name][0].Update(p1.start, p1.end)
                    interaction[inter_name][1].Update(p2.start, p2.end)
                    interaction[inter_name][0].cluster += 1
        except Exception as e:
            print(e, file=sys.stderr)
        if k % 20000 == 0:
            print("  Reading %d pairs of segments\r" % (k), end='', file=sys.stderr)
    print("Get total %d pairs." % (k))
    print("Single fragment count: %d." % (sgcount))
    print("  number of different RNAs is %d " % (len(part)))

    total = k  # total pairs used
    n = 0
    k = 0  # record number of strong interactions
    for i in interaction:
        n += 1
        count = interaction[i][0].cluster
        if count < min_interaction:
            continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part[p1_name]
        P2.cluster = part[p2_name]
        if part[p1_name] < min_clusterS or part[p2_name] < min_clusterS:
            continue
        real_p = 1 - hypergeom.cdf(count, total, part[p1_name], part[p2_name])
        if real_p <= p_value:
            k = k + 1
            try:
                log_p = math.log(real_p)
            except ValueError:
                log_p = -float("Inf")
            print(str(P1) + '\t' + str(P2) + '\t%d\t%.4f' % (count, log_p), file=output)
        if n % 500 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(interaction)), end='', file=sys.stderr)

    k1 = 0
    for i in selfinteraction:
        n += 1
        count = selfinteraction[i].cluster
        if count < min_interaction:
            continue
        p1_name = i
        P1 = selfinteraction[i]
        P1.cluster = part[p1_name]
        if part[p1_name] < min_clusterS:
            continue
        k1 = k1 + 1
        print(str(P1) + '\t%d' % (count), file=outputIntra)
        if n % 500 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(interaction)), end='', file=sys.stderr)
    print("# Find %d strong and %d self interactions. Cost time: %.2f s"
          % (k, k1, time() - t1))
def Main():
    t1 = time()
    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    ncpus = args.parallel

    # store genomic location of part1 and part2
    part1 = []
    part2 = []
    k = 0
    print("# Inputing data...", file=sys.stderr)
    chr_list = []
    for line in inp.read().split('\n'):
        if line == '':
            continue
        line = line.strip().split('\t')
        p1 = annotated_bed(line[0:10], id=k)
        p2 = annotated_bed(line[11:], id=k)
        if isinstance(p1.start, list):
            p1.start = int(p1.start[0])
            p1.end = int(p1.end[-1])
        if isinstance(p2.start, list):
            p2.start = int(p2.start[0])
            p2.end = int(p2.end[-1])
        if SingleFragment(p1, p2):
            continue
        k += 1
        part1.append(p1)
        part2.append(p2)
        if p1.chr not in chr_list:
            chr_list.append(p1.chr)
        if p2.chr not in chr_list:
            chr_list.append(p2.chr)
        if k % 20000 == 0:
            print("  Reading %d pairs of segments\r" % (k), end='', file=sys.stderr)
    print("Get total %d pairs." % (k), file=sys.stderr)
    if len(part1) != len(part2):
        print("## ERROR: number of regions in two part not match!!", file=sys.stderr)
        sys.exit(0)

    # sort in genomic order, easy for clustering
    part1 = sorted(part1, key=attrgetter('start'))
    part1 = sorted(part1, key=attrgetter('chr'))
    part2 = sorted(part2, key=attrgetter('start'))
    part2 = sorted(part2, key=attrgetter('chr'))

    # for parallel computing
    print("# Generating clusters for two parts...", file=sys.stderr)
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs1 = []
    jobs2 = []
    for chro in chr_list:
        part1_temp = list(filter(lambda p: p.chr == chro, part1))
        if len(part1_temp) > 0:
            jobs1.append(job_server.submit(cluster_regions, (part1_temp, min_clusterS),
                                           (annotated_bed,), ("UnionFind", "copy",)))
        part2_temp = list(filter(lambda p: p.chr == chro, part2))
        if len(part2_temp) > 0:
            jobs2.append(job_server.submit(cluster_regions, (part2_temp, min_clusterS),
                                           (annotated_bed,), ("UnionFind", "copy",)))
    cluster_pool1 = {}
    part1 = []
    for job in jobs1:
        try:
            part1 = part1 + job()[1]
            cluster_pool1.update(job()[0])
        except Exception:
            print("Wrong in %s, part1" % (job()[2]), file=sys.stderr)
            continue
    cluster_pool2 = {}
    part2 = []
    for job in jobs2:
        try:
            part2 = part2 + job()[1]
            cluster_pool2.update(job()[0])
        except Exception:
            continue
    print("  cluster number for part1 is %d " % (len(cluster_pool1)), file=sys.stderr)
    print("  cluster number for part2 is %d " % (len(cluster_pool2)), file=sys.stderr)

    # sort back to pair two parts
    part1 = sorted(part1, key=attrgetter('id'))
    part2 = sorted(part2, key=attrgetter('id'))
    print("size of part1&2:", len(part1), len(part2), file=sys.stderr)

    c_interaction = {}
    for i in range(len(part1)):
        region1 = str(part1[i])
        region2 = str(part2[i])
        try:
            inter = part1[i].cluster + "--" + part2[i].cluster
        except Exception:
            print(i, part1[i].cluster, part2[i].cluster, file=sys.stderr)
            sys.exit()
        if inter in c_interaction:
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 1

    # annotation file
    print("# Indexing annotation files", file=sys.stderr)
    dbi_all = DBI.init(args.annotation, "bed")
    dbi_detail = DBI.init(args.db_detail, "bed")
    dbi_repeat = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")

    print("# finding strong interactions from clusters...", file=sys.stderr)
    k = 0  # record for strong interactions
    n = 0
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction:
            continue
        i = interaction.split("--")[0]
        j = interaction.split("--")[1]
        try:
            # we select clusters with size no less than 5, so some interactions
            # cannot be found in clusters
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except KeyError:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
            cluster_pool1[i].Annotate(dbi_all, dbi_detail, dbi_repeat)
            cluster_pool2[j].Annotate(dbi_all, dbi_detail, dbi_repeat)
            try:
                log_p = math.log(real_p)
            except ValueError:
                log_p = -float("Inf")
            print(str(cluster_pool1[i]) + '\t' + str(cluster_pool2[j])
                  + '\t%d\t%.4f' % (count, log_p), file=output)
        if n % 1000 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(c_interaction)), end='', file=sys.stderr)
    print("# Find %d strong interactions. Cost time: %.2f s" % (k, time() - t1),
          file=sys.stderr)

    if args.FDR:
        print("# Permutated results:", file=sys.stderr)
        for i in range(10):
            shuffle(part2)
            [n_r_I, n_r_SI] = Random_strongInteraction(part1, part2,
                                                       cluster_pool1, cluster_pool2)
            print("  ", i, n_r_I, n_r_SI, n_r_SI * 1.0 / n_r_I, file=sys.stderr)
def enrichment_hypergeo(termList, entityList, species, useIea=True, asGenes=True,
                        aspect="biological_process", verbose=True):
    """
    termList -- the terms to be tested
    species -- an ncbi taxa id
    entityList -- gene or uniprot ids

    What is the probability of finding a given number of terms if we randomly
    select N out of M objects?

    M -- genes with at least one annotation
    N -- number of draws or size of gene list
    k -- the number of genes annotated by a given term (total type I objects)
    x -- number of times we observe a term in the gene list (draws)

    in R the cdf can be obtained with phyper(x,k,M-k,N)
    hypergeom.pmf(x, M, k, N)

    Returns a dict where term id is the key and hypergeo pvalue is the value
    """
    # connect to db and get annotations for the species
    session, engine = db_connect()
    geneAnnots, uniprotAnnots = fetch_taxa_annotations([species], engine,
                                                       useIea=useIea,
                                                       verbose=verbose,
                                                       aspect=aspect)
    if asGenes == True:
        entity2go = geneAnnots
    else:
        entity2go = uniprotAnnots

    go2entity = {}
    for entity, go in entity2go.items():
        for term in go:
            if term not in go2entity:
                go2entity[term] = set([])
            go2entity[term].update([entity])
    for go, entity in go2entity.items():
        go2entity[go] = list(entity)

    print("total go terms - %s" % (len(go2entity.keys())))
    print("total entities - %s" % (len(entity2go.keys())))

    # set variables
    M = len(entity2go.keys())
    N = len(entityList)
    results = {}
    for testTerm in termList:
        k = len(go2entity[testTerm])
        x = 0
        for entity in entityList:
            if entity in entity2go and testTerm in entity2go[entity]:
                x += 1
        # get a p-value
        if 0 in [x, M, N, k]:
            pvalue = np.nan
        else:
            cdf = hypergeom.cdf(x, M, k, N, loc=0)
            if cdf > 0:
                pvalue = 2 * (1 - hypergeom.cdf(x, M, k, N))
            else:
                pvalue = 2 * hypergeom.cdf(x, M, k, N)
        results[testTerm] = pvalue
    return results
def Main():
    t1 = time()
    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')

    # store genomic location of part1 and part2
    part1 = []
    part2 = []
    k = 0
    print("# Inputing data...", file=sys.stderr)
    for line in inp.read().split('\n'):
        if line == '':
            continue
        line = line.strip().split('\t')
        k = k + 1
        part1.append(annotated_bed(line[0:7], id=k))
        part2.append(annotated_bed(line[8:], id=k))
        if k % 20000 == 0:
            print("  Reading %d pairs of segments\r" % (k), end='', file=sys.stderr)
    print("Get total %d pairs." % (k), file=sys.stderr)
    if len(part1) != len(part2):
        print("## ERROR: number of regions in two part not match!!", file=sys.stderr)
        sys.exit(0)

    # sort in genomic order, easy for clustering
    part1 = sorted(part1, key=attrgetter('start'))
    part1 = sorted(part1, key=attrgetter('chr'))
    part2 = sorted(part2, key=attrgetter('start'))
    part2 = sorted(part2, key=attrgetter('chr'))

    print("# Generating clusters for two parts...", file=sys.stderr)
    print("  Part1:", file=sys.stderr)
    cluster_pool1 = cluster_regions(part1, min_clusterS)
    print("  cluster number for part1 is %d " % (len(cluster_pool1)), file=sys.stderr)
    print("  Part2:", file=sys.stderr)
    cluster_pool2 = cluster_regions(part2, min_clusterS)
    print("  cluster number for part2 is %d " % (len(cluster_pool2)), file=sys.stderr)

    # sort back to pair two parts
    part1 = sorted(part1, key=attrgetter('id'))
    part2 = sorted(part2, key=attrgetter('id'))

    c_interaction = {}
    for i in range(len(part1)):
        region1 = str(part1[i])
        region2 = str(part2[i])
        inter = "%d--%d" % (part1[i].cluster, part2[i].cluster)
        if inter in c_interaction:
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 0

    print("# finding strong interactions from clusters...", file=sys.stderr)
    k = 0  # record for strong interactions
    n = 0
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction:
            continue
        i = int(interaction.split("--")[0])
        j = int(interaction.split("--")[1])
        try:
            # we select clusters with size no less than 5, so some interactions
            # cannot be found in clusters
            count1 = cluster_pool1[i].cluster
            count2 = cluster_pool2[j].cluster
        except KeyError:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part1), count1, count2)
        if real_p <= p_value:
            k = k + 1
            print(str(cluster_pool1[i]) + '\t' + str(cluster_pool2[j])
                  + '\t%d\t%.5f' % (count, real_p), file=output)
        if n % 1000 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(c_interaction)), end='', file=sys.stderr)
    print("# Find %d strong interactions. Cost time: %.2f s" % (k, time() - t1),
          file=sys.stderr)

    if args.FDR:
        print("# Permutated results:", file=sys.stderr)
        for i in range(10):
            shuffle(part2)
            [n_r_I, n_r_SI] = Random_strongInteraction(part1, part2,
                                                       cluster_pool1, cluster_pool2)
            print("  ", i, n_r_I, n_r_SI, n_r_SI * 1.0 / n_r_I, file=sys.stderr)
rxnpvals = np.array(rxnpvals)
recon2subs = np.array(recon2subs)
rxnrangediffs = np.array(rxnrangediffs)
uniqSubs = np.unique(recon2subs)

numrxnspos = []
numsigrxnspos = []
hypergeomparrpos = []
for sub in uniqSubs:
    N = len(recon2rxns)
    M = sum(recon2subs == sub)
    K = sum(np.logical_and(rxnpvals < .05, rxnrangediffs > 0))
    x = sum(np.logical_and(np.logical_and(rxnpvals < .05, rxnrangediffs > 0),
                           recon2subs == sub))
    numrxnspos.append(M)
    numsigrxnspos.append(x)
    hypergeomparrpos.append(1 - hypergeom.cdf(x - 1, N, M, K))

numrxnsneg = []
numsigrxnsneg = []
hypergeomparrneg = []
for sub in uniqSubs:
    N = len(recon2rxns)
    M = sum(recon2subs == sub)
    K = sum(np.logical_and(rxnpvals < .05, rxnrangediffs < 0))
    x = sum(np.logical_and(np.logical_and(rxnpvals < .05, rxnrangediffs < 0),
                           recon2subs == sub))
    numrxnsneg.append(M)
    numsigrxnsneg.append(x)
    hypergeomparrneg.append(1 - hypergeom.cdf(x - 1, N, M, K))

labelarr = []
valuesarr = []
youngvals = totalvar['young'].values()
for i in range(len(youngvals)):
def _motif_sig(fore_hits, fore_size, back_hits, back_size):
    # Inclusive upper tail P[X >= fore_hits]: 1 - cdf plus the pmf at fore_hits
    return (
        1 - hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) +
        hypergeom.pmf(fore_hits, back_size, back_hits, fore_size)
    )
def cphyper(k, K, n, N):
    # Upper-tail probability P[X >= k]
    return 1.0 - hypergeom.cdf(k - 1, N, n, K)
import matplotlib.pyplot as plt
from scipy.stats import hypergeom, rv_discrete
import numpy as np

numargs = hypergeom.numargs
[M, n, N] = [100, 10, 3]  # the original placeholder N = -1 is not a valid sample size

# Display frozen pmf:
rv = hypergeom(10, 20, 3)
print(rv.dist.b)
x = np.arange(0, min(rv.dist.b, 3) + 1)  # builtin min; np.min(a, 3) would treat 3 as an axis
h = plt.plot(x, rv.pmf(x))

# Check accuracy of cdf and ppf:
prb = hypergeom.cdf(x, M, n, N)
h = plt.semilogy(np.abs(x - hypergeom.ppf(prb, M, n, N)) + 1e-20)

# Random number generation:
R = hypergeom.rvs(M, n, N, size=100)

# Custom made discrete distribution:
vals = [np.arange(7), (0.1, 0.2, 0.3, 0.1, 0.1, 0.1, 0.1)]
custm = rv_discrete(name='custm', values=vals)
h = plt.plot(vals[0], custm.pmf(vals[0]))
def fisher_exact(c):
    """Performs a Fisher exact test on a 2x2 contingency table.

    Parameters
    ----------
    c : array_like of ints
        A 2x2 contingency table.

    Returns
    -------
    oddsratio : float
        This is the prior odds ratio and not a posterior estimate.
    p-value : float
        P-value for the 2-sided hypothesis of independence.

    Examples
    --------
    >>> fisher_exact([[100, 2], [1000, 5]])
    (0.25, 0.13007593634330314)
    """
    c = np.asarray(c, dtype=np.int64)  # int32 is not enough for the algorithm
    oddsratio = (c[0, 0] * c[1, 1] / float(c[1, 0] * c[0, 1])
                 if (c[1, 0] > 0 and c[0, 1] > 0) else np.inf)
    n1 = c[0, 0] + c[0, 1]
    n2 = c[1, 0] + c[1, 1]
    n = c[0, 0] + c[1, 0]

    mode = int(float((n + 1) * (n1 + 1)) / (n1 + n2 + 2))
    pexact = hypergeom.pmf(c[0, 0], n1 + n2, n1, n)
    pmode = hypergeom.pmf(mode, n1 + n2, n1, n)

    epsilon = 1 - 1e-4
    # builtin max here; np.max(pexact, pmode) would treat pmode as an axis
    if np.abs(pexact - pmode) / max(pexact, pmode) <= 1 - epsilon:
        return oddsratio, 1.

    elif c[0, 0] < mode:
        plower = hypergeom.cdf(c[0, 0], n1 + n2, n1, n)
        if hypergeom.pmf(n, n1 + n2, n1, n) > pexact / epsilon:
            return oddsratio, plower

        # Binary search for where to begin upper half.
        lo = mode
        hi = n
        guess = -1
        while hi - lo > 1:
            # integer midpoint (// keeps the search on integer support)
            guess = hi if hi == lo + 1 and guess == lo else (hi + lo) // 2
            pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
            if pguess <= pexact and hypergeom.pmf(guess - 1, n1 + n2, n1, n) > pexact:
                break
            elif pguess < pexact:
                hi = guess
            else:
                lo = guess
        if guess == -1:
            guess = lo
        while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
            guess -= 1
        while hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
            guess += 1
        p = plower + hypergeom.sf(guess - 1, n1 + n2, n1, n)
        if p > 1.0:
            p = 1.0
        return oddsratio, p
    else:
        pupper = hypergeom.sf(c[0, 0] - 1, n1 + n2, n1, n)
        if hypergeom.pmf(0, n1 + n2, n1, n) > pexact / epsilon:
            return oddsratio, pupper

        # Binary search for where to begin lower half.
        lo = 0
        hi = mode
        guess = -1
        while hi - lo > 1:
            guess = hi if hi == lo + 1 and guess == lo else (hi + lo) // 2
            pguess = hypergeom.pmf(guess, n1 + n2, n1, n)
            if pguess <= pexact and hypergeom.pmf(guess + 1, n1 + n2, n1, n) > pexact:
                break
            elif pguess <= pexact:
                lo = guess
            else:
                hi = guess
        if guess == -1:
            guess = lo
        while hypergeom.pmf(guess, n1 + n2, n1, n) < pexact * epsilon:
            guess += 1
        while guess > 0 and hypergeom.pmf(guess, n1 + n2, n1, n) > pexact / epsilon:
            guess -= 1
        p = pupper + hypergeom.cdf(guess, n1 + n2, n1, n)
        if p > 1.0:
            p = 1.0
        return oddsratio, p
lc = l.strip("\n").split("\t")
tf[lc[1]] = 1
if lc[1] in expr:
    tf_num = tf_num + 1
    if lc[1] in degs:
        tf_degs = tf_degs + 1

## [M, n, N] = [express_genes, degs, tf_binding_genes]
[express_genes, degs, tf_binding_genes] = [expr_num, degs_num, tf_num]

# the probability of at most tf_degs
prb = hypergeom.cdf(tf_degs, express_genes, degs, tf_binding_genes)

prefix = sys.argv[4]
print("%s\t%.12f\t%d\t%d\t%d\t%d" % (prefix, prb, express_genes, degs,
                                     tf_binding_genes, tf_degs))
def Main():
    t1 = time()
    global min_interaction, p_value
    args = ParseArg()
    inp = open(args.input, 'r')
    min_clusterS = args.min_clusterS
    min_interaction = args.min_interaction
    p_value = args.p_value
    output = open(args.output, 'w')
    ncpus = args.parallel

    # store genomic location of part1 and part2
    part = []
    k = 0
    print("# Inputing data...", file=sys.stderr)
    chr_list = []
    for line in inp.read().split('\n'):
        if line == '':
            continue
        line = line.strip().split('\t')
        p1 = annotated_bed(line[0:8], id=k, part=1)
        p2 = annotated_bed(line[9:], id=k, part=2)
        if SingleFragment(p1, p2):
            continue
        k += 1
        part.append(p1)
        part.append(p2)
        if p1.chr not in chr_list:
            chr_list.append(p1.chr)
        if p2.chr not in chr_list:
            chr_list.append(p2.chr)
        if k % 20000 == 0:
            print("  Reading %d pairs of segments\r" % (k), end='', file=sys.stderr)
    print("Get total %d pairs." % (k), file=sys.stderr)

    # sort in genomic order, easy for clustering
    part = sorted(part, key=attrgetter('start'))
    part = sorted(part, key=attrgetter('chr'))

    # for parallel computing
    print("# Generating clusters for two parts...", file=sys.stderr)
    # tuple of all parallel python servers to connect with
    ppservers = ()
    job_server = pp.Server(ncpus, ppservers=ppservers)
    jobs = []
    for chro in chr_list:
        part_temp = list(filter(lambda p: p.chr == chro, part))
        if len(part_temp) > 0:
            jobs.append(job_server.submit(cluster_regions, (part_temp, min_clusterS),
                                          (annotated_bed,), ("UnionFind", "copy",)))
    cluster_pool = {}
    part = []
    for job in jobs:
        try:
            part = part + job()[1]
            cluster_pool.update(job()[0])
        except Exception:
            print("Wrong in %s, part1" % (job()[2]), file=sys.stderr)
            continue
    print("  cluster number is %d " % (len(cluster_pool)), file=sys.stderr)

    # sort back to pair two parts
    part = sorted(part, key=attrgetter('part'))
    part = sorted(part, key=attrgetter('id'))
    print("size of part", len(part), file=sys.stderr)

    c_interaction = {}
    i = 0
    while i < len(part):
        P1 = part[i]
        P2 = part[i + 1]
        assert P1.id == P2.id
        i += 2
        print("%d\r" % (i), end='', file=sys.stderr)
        if P1.cluster == P2.cluster:
            continue
        if P1.cluster < P2.cluster:
            inter = P1.cluster + "--" + P2.cluster
        else:
            inter = P2.cluster + "--" + P1.cluster
        if inter in c_interaction:
            c_interaction[inter] += 1
        else:
            c_interaction[inter] = 1

    # annotation file
    print("# Indexing annotation files", file=sys.stderr)
    dbi_all = DBI.init(args.annotation, "bed")
    dbi_detail = DBI.init(args.db_detail, "bed")
    dbi_repeat = DBI.init("/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt", "bed")

    print("# finding strong interactions from clusters...", file=sys.stderr)
    k = 0  # record for strong interactions
    n = 0
    for interaction in c_interaction:
        n = n + 1
        count = c_interaction[interaction]
        if count < min_interaction:
            continue
        i = interaction.split("--")[0]
        j = interaction.split("--")[1]
        try:
            # we select clusters with size no less than 5, so some interactions
            # cannot be found in clusters
            count1 = cluster_pool[i].cluster
            count2 = cluster_pool[j].cluster
        except KeyError:
            continue
        real_p = 1 - hypergeom.cdf(count, len(part) // 2, count1, count2)
        if real_p <= p_value:
            k = k + 1
            cluster_pool[i].Annotate(dbi_all, dbi_detail, dbi_repeat)
            cluster_pool[j].Annotate(dbi_all, dbi_detail, dbi_repeat)
            try:
                log_p = math.log(real_p)
            except ValueError:
                log_p = -float("Inf")
            print(str(cluster_pool[i]) + '\t' + str(cluster_pool[j])
                  + '\t%d\t%.4f' % (count, log_p), file=output)
        if n % 1000 == 0:
            print("  Progress ( %d / %d )\r" % (n, len(c_interaction)), end='', file=sys.stderr)
    print("# Find %d strong interactions. Cost time: %.2f s" % (k, time() - t1),
          file=sys.stderr)
def p_hypergeom(self,N_pop,n_chosen,K_pop,k_success):
    # P(X <= k_success) when drawing n_chosen items from a population of
    # N_pop containing K_pop successes; scipy's order is cdf(k, M, n, N)
    return hypergeom.cdf(k_success,N_pop,K_pop,n_chosen)
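
# Example use of the wrapper above, stated as an assumption since the
# surrounding class is not shown (numbers are hypothetical): P(X <= 2)
# when drawing 10 items from a population of 100 that contains 20
# successes. The standalone equivalent is:
from scipy.stats import hypergeom

print(hypergeom.cdf(2, 100, 20, 10))  # == obj.p_hypergeom(100, 10, 20, 2)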
def plot_cell_enrichments(
    ds,
    f=None,
    enrichments=None,
    ax=None,
    title=None,
):
    LOGGER.info("Plotting cell type enrichments")

    if enrichments is None:
        enrichments = brs.enrichments.get_enrichments(
            list(ds.species)[0],
        )

    if f is None:
        f = {'p': .05, 'asym_fold': 1.25}

    if isinstance(f, dict):
        f = [f]

    if ax is None:
        _, ax = plt.subplots(figsize=(4, 3))

    cell_prots = {
        cell: [key for key, val in enrichments.items() if val == cell]
        for cell in brs.CELL_TYPES
    }

    display_name = {
        'Myelinating Oligodendrocytes': 'Oligodendrocytes',
    } if sum([
        'Oligo' in cell and len(cell_prots.get(cell, [])) > 0
        for cell in brs.CELL_TYPES
    ]) else {}

    vals = []

    ds = ds.filter(
        protein=set(j for i in cell_prots.values() for j in i),
        fn=lambda x: len(x['Proteins']) < 2,
    )

    hatches = ["", "//", "o", "x", "."]

    for cell in brs.CELL_TYPES:
        for ind, fil in enumerate(f):
            dc = ds.filter(protein=set(cell_prots[cell]))

            fore_hits = dc.filter(fil).shape[0]
            fore_size = dc.shape[0]

            back_hits = ds.filter(fil).shape[0]
            back_size = ds.shape[0]

            if fore_size < 1 or back_size < 1:
                continue

            val = (
                1 -
                hypergeom.cdf(fore_hits, back_size, back_hits, fore_size) +
                hypergeom.pmf(fore_hits, back_size, back_hits, fore_size)
            )
            vals.append(
                pd.Series(
                    OrderedDict([
                        ('cell', display_name.get(cell, cell)),
                        ('fore hits', fore_hits),
                        ('fore size', fore_size),
                        ('back hits', back_hits),
                        ('back size', back_size),
                        ('p-value', val),
                        ('-log10 p-value', -np.log10(val)),
                        ('color', brs.CELL_COLORS[cell]),
                        ('hatch', hatches[ind % len(hatches)]),
                        ('hue', format_title(f=fil)),
                    ])
                )
            )

    df = pd.DataFrame(vals)

    ax = sns.barplot(
        data=df,
        y='cell',
        x='-log10 p-value',
        hue='hue',
        ax=ax,
    )
    ax.axvline(-np.log10(.01), color='k', linestyle=':')
    ax.legend(
        handles=[
            mpatches.Patch(
                facecolor='w',
                edgecolor='k',
                hatch=i,
                label=df['hue'].iloc[ind],
            )
            for ind, i in enumerate(hatches[:len(f)])
        ]
    )

    for hatch, color, p in zip(
        df['hatch'],
        df['color'],
        sorted(ax.patches, key=lambda x: x.xy[1]),
    ):
        p.set_hatch(hatch)
        p.set_facecolor(color)
        p.set_edgecolor('k')

    if title:
        ax.set_title(title)

    ax.set_ylabel('')
    ax.set_xlabel('p-value')
    ax.set_xticklabels(['{:.3}'.format(10 ** -i) for i in ax.get_xticks()])

    return ax.get_figure(), ax
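
# Side note (a sketch, separate from the function above): the tail
# expression 1 - cdf(k, M, n, N) + pmf(k, M, n, N) used for `val` is the
# inclusive upper tail P(X >= k), identical to hypergeom.sf(k - 1, M, n, N).
import numpy as np
from scipy.stats import hypergeom

k, M, n, N = 4, 60, 20, 12  # arbitrary illustrative parameters
np.testing.assert_allclose(
    1 - hypergeom.cdf(k, M, n, N) + hypergeom.pmf(k, M, n, N),
    hypergeom.sf(k - 1, M, n, N),
)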
def Main():
    t1=time()
    args=ParseArg()
    inp = open(args.input, 'r')
    min_clusterS=args.min_clusterS
    min_interaction=args.min_interaction
    p_value=args.p_value
    output=open(args.output,'w')

    # store count of RNA for part1 and part2
    part1={}
    part2={}
    k=0
    print >> sys.stderr,"# Inputing data..."
    interaction = {}  # store number of interactions for different RNAs
    Types = ["snoRNA","protein_coding","snRNA","lincRNA","tRNA","misc_RNA",
             "pseudogene","miRNA","antisense","sense_intronic","non_coding",
             "processed_transcript"]
    for line in inp.read().split('\n'):
        if line=='': continue
        line=line.strip().split('\t')
        p1=annotated_bed(line[0:8],id=k,cluster=1)
        p2=annotated_bed(line[9:],id=k,cluster=1)
        if SingleFragment(p1,p2): continue
        k+=1
        if p1.type in Types:
            p1_name = p1.chr+":"+p1.name
            if p1_name not in part1:
                part1[p1_name]=1
            else:
                part1[p1_name]+=1
        if p2.type in Types:
            p2_name = p2.chr+":"+p2.name
            if p2_name not in part2:
                part2[p2_name]=1
            else:
                part2[p2_name]+=1
        if p1.type in Types and p2.type in Types:
            inter_name = p1_name+"--"+p2_name
            if inter_name not in interaction:
                interaction[inter_name]=[copy.deepcopy(p1),copy.deepcopy(p2)]
            else:
                interaction[inter_name][0].Update(p1.start,p1.end)
                interaction[inter_name][1].Update(p2.start,p2.end)
                interaction[inter_name][0].cluster+=1
        if k%20000==0:
            print >> sys.stderr,"  Reading %d pairs of segments\r"%(k),
    print >> sys.stderr,"Get total %d pairs."%(k)
    print >>sys.stderr,"  number of different RNAs for part1 is %d"%(len(part1))
    print >>sys.stderr,"  number of different RNAs for part2 is %d"%(len(part2))

    total = k  # total pairs used
    n=0
    k=0  # record number of strong interactions
    for i in interaction:
        n+=1
        count = interaction[i][0].cluster
        if count < min_interaction: continue
        p1_name = i.split("--")[0]
        p2_name = i.split("--")[1]
        P1 = interaction[i][0]
        P2 = interaction[i][1]
        P1.cluster = part1[p1_name]
        P2.cluster = part2[p2_name]
        if part1[p1_name]<min_clusterS or part2[p2_name]<min_clusterS: continue
        real_p=1-hypergeom.cdf(count,total,part1[p1_name],part2[p2_name])
        if real_p<=p_value:
            k=k+1
            try:
                log_p = math.log(real_p)
            except ValueError:
                log_p = -float("Inf")
            print >> output, str(P1)+'\t'+str(P2)+'\t%d\t%.4f'%(count,log_p)
        if n%100==0:
            print >> sys.stderr, "  Progress ( %d / %d )\r"%(n,len(interaction)),
    print >> sys.stderr,"# Find %d strong interactions. Cost time: %.2f s"%(k,time()-t1)
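
# A sketch of an alternative to the try/except above (an assumption, not
# part of the original pipeline): math.log(real_p) fails only when the
# survival probability underflows to 0, and hypergeom.logsf returns
# log P(X > k) directly; depending on the scipy version, it can avoid
# that underflow entirely.
from scipy.stats import hypergeom

def log_interaction_pvalue(count, total, n1, n2):
    # log of P(X > count) for X ~ Hypergeom(M=total, n=n1, N=n2)
    return hypergeom.logsf(count, total, n1, n2)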