def write_drug_rank(tbl_file, fig_file, drug2pro, ind2drug, ind2pro, M, N):
    """Write drugs ranked by hypergeometric enrichment p-value and plot the top 20.

    Parameters
    ----------
    tbl_file : str
        Path of the tab-separated table to write.
    fig_file : str
        Output path forwarded to ``enrich_plot``.
    drug2pro : dict
        Maps a drug key to an iterable of items ``x`` where ``x[0]`` is a
        protein key (looked up in ``ind2pro``) and ``x[1]`` is a number that is
        summed to obtain the observed count ``m``.
    ind2drug : dict
        Maps a drug key to a record; ``record[1]`` is used as the display
        name, ``record[4]`` as the background count ``q`` and ``record[0:-1]``
        is written to the table.
    ind2pro : dict
        Maps a protein key to a record; ``record[4]`` is written to the table.
    M, N : int
        Population size and number of draws passed to ``hypergeom.sf``.
    """
    drug2pval = {}
    for k, v in drug2pro.items():
        q = ind2drug[k][4]
        m = sum(x[1] for x in v)
        # P(X >= m).  The number of successes in the population is q, unless
        # the observed total m exceeds it, in which case m itself is used.
        drug2pval[k] = hypergeom.sf(m - 1, M, max(m, q), N)

    drug_name2erich_score = {}  # top-20 scores for the enrichment plot
    # ``with`` guarantees the table is closed even if a write fails midway.
    with open(tbl_file, 'w') as out:
        out.write(
            'Rank\tPubchem_ID\tChemical_Name\tMolecular_Weight\tSMILES\tProteins\tNum_Targets\tP_value\n'
        )
        ranked = sorted(drug2pval.items(), key=lambda x: x[1])
        for n, (k, v) in enumerate(ranked, start=1):
            drug = ind2drug[k]
            pros = [(ind2pro[x[0]][4], x[1]) for x in drug2pro[k]]
            out.write(
                str(n) + '\t' + '\t'.join(str(x) for x in drug[0:-1]) + '\t' +
                ';'.join(str(x) for x in pros) + '\t' + str(len(pros)) + '\t' +
                str(v) + '\n')
            if n <= 20:
                drug_name2erich_score[drug[1]] = -np.log10(v)
    enrich_plot(drug_name2erich_score, fig_file, 'drug')
def precision_table(pairs, Ls):
    '''Compute precisions at various numbers of top pairs (any, and long-range).

    Also computes hypergeometric p-values for the ``dist.any_struct`` column.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must contain a boolean ``LR`` column (used as a row mask), a
        ``dist.any_struct`` column and any number of other columns whose name
        contains ``dist``.  A pair counts as a hit when its distance is < 5.
    Ls : iterable of int
        Numbers of top rows (``head(L)``) at which to evaluate precision.

    Returns
    -------
    pandas.DataFrame
        One column per ``str(L)`` and ``'LR ' + str(L)``; one row per distance
        column (precision formatted as a percentage string) plus a
        ``'p-value (any struct)'`` row.
    '''
    dists = [k for k in pairs.keys() if 'dist' in k]
    LR = pairs[pairs['LR']]

    M = len(pairs)
    n = np.sum(pairs['dist.any_struct'].lt(5))
    LR_M = len(LR)
    # The long-range background must be counted within the long-range subset;
    # counting over all pairs can exceed LR_M and make the p-value NaN.
    LR_n = np.sum(LR['dist.any_struct'].lt(5))

    prec = {}
    for L in Ls:
        all_key = str(L)
        lr_key = 'LR ' + str(L)
        prec[all_key] = {}
        prec[lr_key] = {}
        for d in dists:
            h = np.sum(pairs[d].head(L).lt(5))
            LR_h = np.sum(LR[d].head(L).lt(5))
            prec[all_key][d] = str(100 * h / L)[:4] + '%'
            prec[lr_key][d] = str(100 * LR_h / L)[:4] + '%'
        h = np.sum(pairs['dist.any_struct'].head(L).lt(5))
        LR_h = np.sum(LR['dist.any_struct'].head(L).lt(5))
        # sf(h - 1) == P(X >= h)
        prec[all_key]['p-value (any struct)'] = hypergeom.sf(h - 1, M, n, L)
        prec[lr_key]['p-value (any struct)'] = hypergeom.sf(
            LR_h - 1, LR_M, LR_n, L)
    return pd.DataFrame(prec)
def calc_HG_test(genelist_dataset, genes_subset_ranking, ranking, th=1):
    """Run a one-sided hypergeometric test and print (and return) its p-value.

    Parameters
    ----------
    genelist_dataset : sequence
        Only ``len(genelist_dataset[0])`` is used, as the population size N.
    genes_subset_ranking : sequence
        Items whose first element is compared with ``th``; its length is the
        number of successes in the population (B) and the number of matching
        items is the observed count (b).
    ranking : sequence
        The number of elements equal to ``th`` is the number of draws (n).
    th : optional
        Value that marks a hit (default 1).

    Returns
    -------
    float
        ``P(X >= b)``.  Earlier versions only printed this value.
    """
    b = sum(1 for x in genes_subset_ranking if x[0] == th)
    B = len(genes_subset_ranking)
    N = len(genelist_dataset[0])
    n = sum(1 for x in ranking if x == th)
    # Single-argument print() calls are valid on both Python 2 and Python 3.
    print("run HG test with {},{},{},{}".format(b, N, B, n))
    pval = hypergeom.sf(b - 1, N, B, n)
    print(pval)
    return pval
def hg_scoring(self, data_links, type='spec-gcf'):
    """
    Calculate hypergeometric scores from DataLinks() co-occurrence matrices.

    The co-occurrence matrices cannot be used directly: "only in GCF" /
    "only in spectrum" counts exclude the shared strains, whereas the test
    needs inclusive totals.  Adding the overlap matrix onto the exclusive
    matrices gives the correct totals for each object.

    Parameters
    ----------
    data_links : object
        Must expose ``M_gcf_strain`` (its second dimension is the number of
        strains) and, depending on ``type``, either ``M_spec_gcf``,
        ``M_notspec_gcf``, ``M_spec_notgcf`` or ``M_fam_gcf``,
        ``M_notfam_gcf``, ``M_fam_notgcf``.
    type : str
        ``'spec-gcf'`` (default) or ``'fam-gcf'``.

    Returns
    -------
    numpy.ndarray
        Element-wise ``P(X >= overlap)``; also stored on ``self`` as
        ``hg_spec_gcf`` or ``hg_fam_gcf``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported values (previously this
        surfaced as an ``UnboundLocalError`` at the return statement).
    """
    if type == 'spec-gcf':
        overlap_counts = data_links.M_spec_gcf
        gcf_counts = overlap_counts + data_links.M_notspec_gcf
        other_counts = overlap_counts + data_links.M_spec_notgcf
    elif type == 'fam-gcf':
        overlap_counts = data_links.M_fam_gcf
        gcf_counts = overlap_counts + data_links.M_notfam_gcf
        other_counts = overlap_counts + data_links.M_fam_notgcf
    else:
        raise ValueError(
            "Unsupported scoring type {!r}; expected 'spec-gcf' or 'fam-gcf'".format(type))

    num_strains = np.ones(overlap_counts.shape) * data_links.M_gcf_strain.shape[1]
    # loc=1 shifts the distribution by one, so sf(k, loc=1) == P(X >= k).
    hg_scores = hypergeom.sf(overlap_counts, num_strains, gcf_counts,
                             other_counts, loc=1)
    if type == 'spec-gcf':
        self.hg_spec_gcf = hg_scores
    else:
        self.hg_fam_gcf = hg_scores
    return hg_scores
def hypergeometric_test(directory):
    """Print hypergeometric p-values for relocalisation gains and losses.

    For each location code (``'C'`` then ``'M'``) the files
    ``output_<code>_gain.csv`` and ``output_<code>_loss.csv`` are read from
    ``directory``.  For each file the test asks how likely it is to observe
    at least the summed ``reloc_following_dup`` events when drawing
    ``number_of_dups`` items from a population of duplications plus
    speciations that contains all relocalisation events.

    The function prints a label line followed by the p-value for every
    location/file pair and returns ``None``.
    """
    for item in ['C', 'M']:
        # Both tables are loaded before anything is printed.
        gain_table = pd.read_csv(f"{directory}/output_{item}_gain.csv")
        loss_table = pd.read_csv(f"{directory}/output_{item}_loss.csv")

        for label, table in ((' gains', gain_table), (' losses', loss_table)):
            observed = table['reloc_following_dup'].sum()
            population = (table['number_of_dups'].sum() +
                          table['number_of_specs'].sum())
            successes = (table['reloc_following_spec'].sum() +
                         table['reloc_following_dup'].sum())
            draws = table['number_of_dups'].sum()
            print(item + label)
            # sf(observed - 1) == P(X >= observed)
            print(hypergeom.sf(observed - 1, population, successes, draws))
def write_tar_rank(tbl_file, fig_file, pro2drug, ind2pro, M, N):
    """Write targets ranked by hypergeometric enrichment p-value and plot the top 20.

    Parameters
    ----------
    tbl_file : str
        Path of the tab-separated table to write.
    fig_file : str
        Output path forwarded to ``enrich_plot``.
    pro2drug : dict
        Maps a protein key to an iterable of items ``x`` where ``x[0]`` is a
        drug key and ``x[1]`` is a number that is summed to obtain the
        observed count ``m``.
    ind2pro : dict
        Maps a protein key to a record; ``record[1]`` is the display name,
        ``record[3]`` the background count ``q`` and ``record[0:-1]`` is
        written to the table.
    M, N : int
        Population size and number of draws passed to ``hypergeom.sf``.
    """
    pro2pval = {}
    for k, v in pro2drug.items():
        q = ind2pro[k][3]
        m = sum(x[1] for x in v)
        # P(X >= m).  The number of successes in the population is q, unless
        # the observed total m exceeds it, in which case m itself is used.
        pro2pval[k] = hypergeom.sf(m - 1, M, max(m, q), N)

    pro_name2enrich_score = {}  # top-20 scores for the enrichment plot
    # ``with`` guarantees the table is closed even if a write fails midway.
    with open(tbl_file, 'w') as out:
        out.write(
            'Rank\tUniprot_ID\tProtein_Name\tEntry_Name\tTot_Num_Chemicals\tGene_Name\tGene_ID\tPDB\tPathway_Ids\tPathway_Names\tGO_Function\tGO_Process\tGO_Component\tChemicals\tNum_Chemicals\tP_value\n'
        )
        ranked = sorted(pro2pval.items(), key=lambda x: x[1])
        for n, (k, v) in enumerate(ranked, start=1):
            pro = ind2pro[k]
            # NOTE(review): ``ind2drug`` is not a parameter of this function;
            # it is resolved from the enclosing module scope - confirm that a
            # module-level ``ind2drug`` exists wherever this is called.
            drugs = [(ind2drug[x[0]][1], x[1]) for x in pro2drug[k]]
            out.write(
                str(n) + '\t' + '\t'.join(str(x) for x in pro[0:-1]) + '\t' +
                ';'.join(str(x) for x in drugs) + '\t' + str(len(drugs)) +
                '\t' + str(v) + '\n')
            if n <= 20:
                pro_name2enrich_score[pro[1]] = -np.log10(v)
    enrich_plot(pro_name2enrich_score, fig_file, 'target')
def getMultiplePsFdr(iva, ivb, model, N, win=6):
    """
    Estimate FDR and p-values for the interval pair (iva, ivb) from nearby windows.

    Nearby window pairs obtained from ``getNearbyPairRegions`` serve as the
    local background: for every such pair the number of shared reads and a
    hypergeometric p-value are collected, and the observed pair is compared
    against them.

    Returns the tuple ``(ra, rb, rab, es, fdr, hyp, chyp, pop, nbp)``:
    read counts for a, b and their overlap as given by ``getPETsforRegions``,
    enrichment score, local FDR, hypergeometric p-value, fraction of nearby
    hypergeometric p-values below it, Poisson p-value and binomial p-value.
    When no usable nearby pair exists the result is
    ``(ra, rb, rab, inf, 0.0, hyp, 0.0, 0.0, 0.0)``.
    """
    ra, rb, rab = getPETsforRegions(iva, ivb, model)
    # Plain hypergeometric test on the observed pair: P(X >= rab).
    hyp = hypergeom.sf(rab - 1.0, N, ra, rb)

    ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
    nearby_hyps, nearby_overlaps, densities = [], [], []
    for na in ivas:
        reads_a = getCounts(na, model[0]).union(getCounts(na, model[1]))
        size_a = float(len(reads_a))
        if size_a < 1:
            continue
        for nb in ivbs:
            reads_b = getCounts(nb, model[0]).union(getCounts(nb, model[1]))
            size_b = len(reads_b)
            if size_b < 1:
                continue
            shared = float(len(reads_a.intersection(reads_b)))
            # background overlap counts, used for the Poisson test
            nearby_overlaps.append(shared)
            # background hypergeometric p-values
            nearby_hyps.append(hypergeom.sf(shared - 1.0, N, size_a, size_b))
            # per-pair interaction density, used for the binomial test
            densities.append(shared / (size_a * size_b))

    if len(nearby_overlaps) == 0:
        return ra, rb, rab, np.inf, 0.0, hyp, 0.0, 0.0, 0.0

    nearby_hyps = np.array(nearby_hyps)
    nearby_overlaps = np.array(nearby_overlaps)

    # local FDR: share of background pairs with a larger overlap
    fdr = len(nearby_overlaps[nearby_overlaps > rab]) / float(len(nearby_overlaps))

    # enrichment score against the mean background overlap
    mean_overlap = float(np.mean(nearby_overlaps))
    es = rab / mean_overlap if mean_overlap > 0 else np.inf

    # share of background hypergeometric p-values smaller than the observed one
    chyp = len(nearby_hyps[nearby_hyps < hyp]) / float(len(nearby_hyps))

    # Poisson test with the local mean as lambda
    pop = poisson.sf(rab - 1.0, mean_overlap)

    # binomial test with success probability scaled from the mean density
    bp = np.mean(densities) * ra * rb / N
    nbp = binom.sf(rab - 1.0, N - rab, bp)

    return ra, rb, rab, es, fdr, hyp, chyp, pop, nbp
def calc_HG_test(total_gene_list_N, tests_gene_list_B, total_gene_list_n):
    """Run a one-sided hypergeometric test and return a formatted result.

    Parameters
    ----------
    total_gene_list_N : sequence
        Background population; its length is N.
    tests_gene_list_B : iterable
        Tested gene set; the number of distinct entries is B.
    total_gene_list_n : sequence
        Drawn genes; its length is n and its distinct overlap with
        ``tests_gene_list_B`` is b.

    Returns
    -------
    str
        ``"<p-value>\\t(<b> <N> <B> <n>)"`` where the p-value is ``P(X >= b)``.
    """
    tested = set(tests_gene_list_B)
    b = len(set(total_gene_list_n).intersection(tested))
    B = len(tested)
    N = len(total_gene_list_N)
    n = len(total_gene_list_n)
    # A single-argument print() call is valid on both Python 2 and Python 3.
    print("run HG test with {},{},{},{}".format(b, N, B, n))
    return "{}\t({} {} {} {})".format(hypergeom.sf(b - 1, N, B, n), b, N, B, n)
def _prob_hypergeo_fast(y_compute, name, X, M, n, N):
    """Compute a hypergeometric p-value and package it with its inputs.

    Description
    -----------
    Suppose a lot of 100 floppy disks (M) contains 20 defective ones (n) and
    10 are selected at random (N).  The probability of drawing more than 2
    defective disks is ``hypergeom.sf(2, 100, 20, 10)``.

    ``X`` is passed to ``sf`` as-is, so the result is ``P(overlap > X)``.
    Callers that want ``P(overlap >= k)`` must pass ``X = k - 1``; using
    ``k`` itself would drop the observed value from the tail.

    The test is only evaluated when ``y_compute`` is truthy and ``X > 0``;
    otherwise ``P`` and ``logP`` are NaN.

    Returns
    -------
    dict
        Keys ``category_label``, ``P``, ``logP``, ``overlap_X``,
        ``popsize_M``, ``nr_succes_pop_n``, ``samplesize_N`` and ``dtype``.
    """
    run_test = y_compute and (X > 0)
    if run_test:
        p_value = hypergeom.sf(X, M, n, N)
        log_p_value = hypergeom.logsf(X, M, n, N)
    else:
        p_value = np.nan
        log_p_value = np.nan

    return {
        'category_label': name,
        'P': p_value,
        'logP': log_p_value,
        'overlap_X': X,
        'popsize_M': M,
        'nr_succes_pop_n': n,
        'samplesize_N': N,
        'dtype': 'categorical',
    }
def tf_hyper_geom(selected_genes: np.ndarray, epi_data_clustered):
    """Return whether the overlap of two indicator vectors is enriched (p < 0.1).

    ``selected_genes`` and ``epi_data_clustered`` are multiplied element-wise,
    so they are expected to be equally shaped 0/1 indicator arrays.  The
    p-value is ``P(X >= overlap)`` for a population of ``selected_genes.size``
    items containing ``sum(selected_genes)`` successes, with
    ``sum(epi_data_clustered)`` draws.
    """
    population = selected_genes.size
    draws = np.sum(epi_data_clustered)
    successes = np.sum(selected_genes)
    overlap = np.sum(selected_genes * epi_data_clustered)
    # sf(overlap - 1) includes the observed overlap in the tail
    return hypergeom.sf(overlap - 1, population, successes, draws) < 0.1
def enrichmentOneSided(subsetGO, backgroundTotal, backgroundGO, subsetTotal):
    """
    One-sided (enrichment) hypergeometric test for a single GO term.

    Computes the probability of seeing ``subsetGO`` or more GO-associated
    genes in ``subsetTotal`` draws from a population of ``backgroundTotal``
    genes that contains ``backgroundGO`` associated genes.

    The survival function is ``1 - cdf`` and the cdf is inclusive
    (``P(X <= k)``), so ``P(X >= k)`` is obtained as ``sf(k - 1)``.

    Parameters
    ----------
    subsetGO : int
        The number of genes in the interest subset associated with a GO term.
    backgroundTotal : int
        The total number of genes in the background set.
    backgroundGO : int
        The number of genes in the background set associated with the GO term.
    subsetTotal : int
        The total number of genes in the interest subset.

    Returns
    -------
    float
        The p-value of the one-sided hypergeometric test.
    """
    return hypergeom.sf(
        subsetGO - 1,
        backgroundTotal,
        backgroundGO,
        subsetTotal,
    )
def get_param(self):
    """Compute hypergeometric p-values per (file, miRNA) and write them to Excel.

    Reads ``Regression_filter.xlsx`` and ``Regression_filter_hyper.xlsx`` from
    ``<self.root>/database/Fantom/v5/cell_lines`` together with the GSEA
    ``.xls`` reports in a fixed output directory, and writes
    ``p_values.xlsx`` next to the input workbooks.  Each output cell holds
    ``x;m;n;k;p`` joined by semicolons.
    """
    dirname = 'C:/Users/Mingyu/gsea_home/output/mar25/my_analysis.Gsea.1585092763735'
    # Index of the chromosome reports by base name (built but not used below).
    fmap = OrderedDict(
        (os.path.splitext(f)[0], os.path.join(dirname, f))
        for f in os.listdir(dirname)
        if f.endswith('.xls') and 'CHR' in f
    )

    table_dir = os.path.join(self.root, 'database/Fantom/v5/cell_lines')
    df_lasso = pd.read_excel(os.path.join(table_dir, 'Regression_filter.xlsx'),
                             index_col=0)
    df = pd.read_excel(os.path.join(table_dir, 'Regression_filter_hyper.xlsx'),
                       index_col=0)

    df_res = pd.DataFrame(index=df.index)
    for fname in df.index:
        row = df.loc[fname]
        gene_sets = df.loc[fname, 'GENEs'].split(':')
        comp = df.loc[fname, 'Comp'].split(':')
        mirnas = df.loc[fname, 'miRNA'].split(':')
        df_gsea = pd.read_csv(os.path.join(dirname, fname + '.xls'), sep='\t')

        m = df_gsea.shape[0]
        n = len(comp)
        for gene_text, mir in zip(gene_sets, mirnas):
            # strip the surrounding bracket characters before splitting
            x = len(gene_text[1:-1].split(','))
            k = len(df_lasso.loc[mir, 'GENEs'].split(';'))
            p = hypergeom.sf(x, n + m, m, k)
            df_res.loc[fname, mir] = ';'.join(map(str, [x, m, n, k, p]))

    out_path = os.path.join(table_dir, 'p_values.xlsx')
    df_res.dropna(how='all', axis=1).to_excel(out_path)
def calc_all_data(data, diff_gene_hascell, n, N, pvalue, ratio):
    """Test every cell type for over-representation among the given genes.

    Parameters
    ----------
    data : pandas.DataFrame
        Column ``0`` is compared with the entries of ``diff_gene_hascell`` and
        column ``1`` with the cell names.
    diff_gene_hascell : iterable
        Genes of interest; each entry that co-occurs with a cell name in
        ``data`` contributes one to that cell's count.
    n, N : int
        Number of draws and population size used by the tests.
    pvalue : float
        Rows are kept only if their p-value is below this cutoff.
    ratio : float
        Rows are kept only if their odds ratio is at least this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``CellName``, ``Pvalue``, ``p.adjust``, ``oddsRatio``,
        ``ExpCount``, ``Count`` and ``Size``, filtered by both cutoffs.

    Notes
    -----
    ``CellName``, ``method`` and ``qvalue`` are read from module scope.
    """
    cellname = []
    pval = []
    odd = []
    exp = []
    cout = []
    size = []
    for i in CellName:
        cell_rows = data[data[1] == i]
        M = cell_rows.shape[0]
        exp_count = n * M / N
        # One set lookup per gene instead of re-filtering the whole frame for
        # every (cell, gene) pair.
        genes_with_cell = set(cell_rows[0])
        k = sum(1 for j in diff_gene_hascell if j in genes_with_cell)
        if method == "Fisher":
            # Run the exact test once and reuse both outputs.
            fisher_result = fisher_exact([[M - k, N - M - n + k], [k, n - k]])
            OddsRatio = fisher_result[0]
            p = fisher_result[1]
        else:
            OddsRatio = k / exp_count
            # scipy's sf(x, m+n, m, k) corresponds to R's
            # phyper(x, m, n, k, lower.tail=FALSE)
            p = hypergeom.sf(k - 1, N, M, n)
        cellname.append(i)
        pval.append(p)
        odd.append(OddsRatio)
        exp.append(exp_count)
        cout.append(k)
        size.append(M)
    qvalues = qvalue(pval)
    fin = pd.DataFrame({
        "CellName": cellname,
        "Pvalue": pval,
        "p.adjust": qvalues,
        "oddsRatio": odd,
        "ExpCount": exp,
        "Count": cout,
        "Size": size,
    })
    fin_fliter = fin[(fin['Pvalue'] < pvalue) & (fin['oddsRatio'] >= ratio)]
    return fin_fliter
def compute_hypergeometric_score(self, complete_list, target_list):
    """Store and return the hypergeometric enrichment p-value of this object.

    The population size is ``complete_list.initialLength``, the number of
    draws is ``target_list.initialLength``; the successes in the population
    and the observed overlap are the first and last entries of
    ``self.afterIntersectionLength``.  The result, ``P(X >= overlap)``, is
    also kept in ``self.hypergeometricScore``.
    """
    successes = self.afterIntersectionLength[0]
    observed = self.afterIntersectionLength[-1]
    score = hypergeom.sf(
        observed - 1,
        complete_list.initialLength,
        successes,
        target_list.initialLength,
    )
    self.hypergeometricScore = score
    return self.hypergeometricScore
def get_enriched_properties(nodes, semantic_type, pcut=1e-4):
    """Return the properties that are over-represented among ``nodes``.

    Only chemical types are supported: small molecules, molecular mixtures
    and drugs are treated as ``biolink:ChemicalEntity``; any other semantic
    type yields an empty list.

    The hypergeometric distribution models drawing objects from a bin:
    the population is every node of the semantic type, the successes are the
    nodes carrying the property, and ``len(nodes)`` items are drawn.  The
    p-value is the probability of drawing at least as many property carriers
    as were observed.

    Parameters
    ----------
    nodes : sized collection
        The input nodes (curies).
    semantic_type : str
        Biolink type of the nodes.
    pcut : float
        Properties with a p-value below this cutoff are reported.

    Returns
    -------
    list of tuple
        ``(p_value, property, n_draws, n_with_property, total_nodes, curies)``
        sorted ascending.
    """
    if semantic_type in [
            'biolink:SmallMolecule', 'biolink:MolecularMixture', 'biolink:Drug'
    ]:
        semantic_type = 'biolink:ChemicalEntity'
    if semantic_type not in ['biolink:ChemicalEntity']:
        return []

    property_lookup = PropertyLookup()
    # properties = {property: (curies with it)}
    properties = property_lookup.collect_properties(nodes, semantic_type)
    if not properties:
        return []

    # Neither value depends on the property, so look them up once.
    ndraws = len(nodes)
    total_node_count = property_lookup.get_nodecount(semantic_type)

    enriched = []
    for prop, curies in properties.items():
        x = len(curies)  # draws with the property
        n = property_lookup.total_nodes_with_property(prop, semantic_type)
        enrichp = hypergeom.sf(x - 1, total_node_count, n, ndraws)
        if enrichp < pcut:
            enriched.append(
                (enrichp, prop, ndraws, n, total_node_count, curies))
    enriched.sort()
    return enriched
def enrichment(self, locus_list, pval_cutoff=0.05, max_term_size=300, min_term_size=5):
    '''
        Evaluate enrichment of the given loci in the terms of this ontology.

        NOTE: only terms that contain at least one locus from
        locus_list are tested.

        Parameters
        ----------
        locus_list : list of co.Locus
            The loci to test for over-representation within the terms
            of the Ontology.
        pval_cutoff : float (default: 0.05)
            Terms with a p-value at or below this value are reported.
        max_term_size : int (default: 300)
            Largest term size that is tested. Useful for filtering out
            large, uninformative terms (e.g. top level GO terms).
        min_term_size : int (default: 5)
            Smallest term size that is tested.

        Returns
        -------
        list
            The significant terms; each has 'pval' and 'hyper' entries
            added to its attrs.
    '''
    id_clause = "','".join([x.id for x in locus_list])
    rows = self.db.cursor().execute(
        '''SELECT DISTINCT(term) FROM term_loci WHERE id IN ('{}') '''.format(id_clause)
    ).fetchall()
    candidates = [self[name] for name, in rows]
    terms = [
        t for t in candidates
        if (len(t) >= min_term_size) and (len(t) <= max_term_size)
    ]

    # The universe is every locus that appears in any tested term.
    universe = set()
    for t in terms:
        universe.update(t.loci)
    num_universe = len(universe)
    self.log(
        'test loci occur in {} terms, containing {} genes'.format(
            len(terms), num_universe
        )
    )

    num_sampled = len(locus_list)
    significant_terms = []
    for term in terms:
        term_genes = term.loci
        num_in_term = len(term_genes)
        if num_in_term > max_term_size:
            continue
        num_common = len(term_genes.intersection(locus_list))
        # sf(num_common - 1) == P(X >= num_common)
        pval = hypergeom.sf(num_common-1, num_universe, num_in_term, num_sampled)
        if pval > pval_cutoff:
            continue
        term.attrs['pval'] = pval
        term.attrs['hyper'] = {
            'pval'         : pval,
            'num_common'   : num_common,
            'num_universe' : num_universe,
            'num_in_term'  : num_in_term,
            'sum_sampled'  : num_sampled
        }
        significant_terms.append(term)
    return significant_terms
def compute_pvalues_by_hypergeom(self, **kwargs):
    """Compute neighborhood enrichment p-values with the hypergeometric test.

    For every (node, attribute) pair the test uses: population = number of
    nodes in the graph, successes = nodes carrying the attribute, draws =
    size of the node's neighborhood, observed = attribute carriers inside
    that neighborhood.  Results are stored in ``self.pvalues_pos`` and, as
    ``-log10`` scores, in ``self.nes``.  When ``self.multiple_testing`` is
    set, the p-values are FDR-adjusted row by row first.

    NOTE(review): keyword arguments are only echoed to stdout here; nothing
    in this method applies them - confirm whether that is intended.
    """
    print('Using the hypergeometric test to calculate enrichment...')

    if kwargs:
        print('Overwriting global settings:')
        for k in kwargs:
            print('\t%s=%s' % (k, str(kwargs[k])))

    n_nodes = self.graph.number_of_nodes()
    n_attributes = len(self.attributes)

    # population size, repeated for every (node, attribute) cell
    N = np.full((n_nodes, n_attributes), float(n_nodes))
    # carriers of each attribute, ignoring missing annotations
    N_in_group = np.tile(np.nansum(self.node2attribute, axis=0), (n_nodes, 1))
    # neighborhood size of each node
    neighborhood_sizes = np.sum(self.neighborhoods, axis=0)[:, np.newaxis]
    N_in_neighborhood = np.tile(neighborhood_sizes, (1, n_attributes))
    # attribute carriers inside each neighborhood (NaN counted as 0)
    annotations = np.where(~np.isnan(self.node2attribute), self.node2attribute, 0)
    N_in_neighborhood_in_group = np.dot(self.neighborhoods, annotations)

    self.pvalues_pos = hypergeom.sf(N_in_neighborhood_in_group - 1, N,
                                    N_in_group, N_in_neighborhood)

    # Correct for multiple testing
    if self.multiple_testing:
        print('Running FDR-adjustment of p-values...')
        out = np.apply_along_axis(fdrcorrection, 1, self.pvalues_pos)
        self.pvalues_pos = out[:, 1, :]

    # Log-transform into neighborhood enrichment scores (NES)
    self.nes = -np.log10(self.pvalues_pos)
def calculate_pvalues(nodes, query, background_attribute, M, min_category_size=3, max_category_size=500, max_category_depth=5, **kwargs):
    """
    calculate pvalues for all categories in the graph

    :param nodes: iterable of node dictionaries from the ontology graph after
        the background was set
    :param query: set of identifiers
    :param background_attribute: node attribute assoc. with the background set
    :param M: background size (population size of the hypergeometric test)
    :param min_category_size: categories of this size or smaller are ignored
    :param max_category_size: categories larger than this number are ignored
    :param max_category_depth: categories whose ``depth`` exceeds this number
        are ignored
    :returns: pvalues, x, n (an iterable of three parallel tuples); ignored
        categories get a NaN p-value
    """
    N = len(query)
    vals = []
    for node in nodes:
        background = node[background_attribute]
        n = len(background)
        x = len(query.intersection(background))
        if ((node.get('depth', 0) > max_category_depth)
                or (n <= min_category_size)
                or (n > max_category_size)):
            vals.append((float('NaN'), x, n))
        else:
            # sf(k) is P(X > k); the enrichment p-value must include the
            # observed count, i.e. P(X >= x) == sf(x - 1).
            vals.append((hypergeom.sf(x - 1, M, n, N), x, n))
    return zip(*vals)
def calc_feat_overrep_lin(condition, ttype):
    """Test feature groups for over-representation among non-zero scores.

    For each of seven cell lines the pickled ``{feature: score}`` mapping at
    ``predict4/results/histScores<condition><cell><ttype>.pkl`` is loaded and
    the non-zero scores are counted per feature group:

    * ``'H'`` - names starting with ``H`` and ending with ``3``
    * ``'B'`` - names starting with ``H`` and ending with ``1``
    * ``'C'`` - names starting with ``E``

    The floored mean count per group over all cell lines is then tested with
    ``hypergeom.sf`` against the group size.  Group sizes and the feature
    total come from the last cell line loaded.

    Prints the condition/type and one line per group; returns the list of
    p-values in the order ``['H', 'B', 'C']``.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only use
    this on trusted result files.
    """
    cells = ['Gm12878', 'H1hesc', 'Helas3', 'Hepg2', 'Huvec', 'K562', 'Nhek']
    keys = ['H', 'B', 'C']
    # One array PER key.  dict.fromkeys(keys, array) would make every key
    # share a single array, so each group would overwrite the others.
    nonzeros = {k: np.zeros(len(cells)) for k in keys}
    f_len = dict.fromkeys(keys, None)
    nz_len = dict.fromkeys(keys, None)

    for c, cell in enumerate(cells):
        path = 'predict4/' + 'results/histScores' + condition + cell + ttype + '.pkl'
        with open(path, 'rb') as handle:
            hist_scores = pickle.load(handle)
        # list(...) keeps this working on Python 3, where keys()/values()
        # are views rather than lists.
        features = list(hist_scores.keys())
        fs_scores = np.array(list(hist_scores.values()))

        # dtype=int so an empty group is still a valid index array
        ind = {
            'H': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('3')], dtype=int),
            'B': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('1')], dtype=int),
            'C': np.array([i for i, v in enumerate(features)
                           if v.startswith('E')], dtype=int),
        }
        for k in keys:
            nonzeros[k][c] = len(np.nonzero(fs_scores[ind[k]])[0])

    for k in keys:
        f_len[k] = len(fs_scores[ind[k]])
        nz_len[k] = math.floor(nonzeros[k].mean())
    t_nz = sum(nz_len[k] for k in keys)

    # Formatted single-argument print() behaves the same on Python 2 and 3.
    print('{} {}'.format(condition, ttype))
    pvals = []
    for k in keys:
        h_score = hypergeom.sf(nz_len[k], len(features), t_nz, f_len[k])
        pvals.append(h_score)
        print('{} {}'.format(k, h_score))
    return pvals
def GSEA(refset, quiery, clumpsetbd, genesetdb, coord_gene_dict, gen='no'):
    """Hypergeometric gene-set enrichment over the loci in ``quiery``.

    Parameters
    ----------
    refset : sized collection
        Only its length is used (as ``N``).
    quiery : dict
        Maps a locus index to an iterable of SNPs.
    clumpsetbd : dict
        Maps a gene-set name to a collection; the number of distinct entries
        is ``K``.
    genesetdb : dict
        Maps a gene-set name to its genes; used for all intersections.
    coord_gene_dict : dict
        Maps a SNP to a record whose element ``[1]`` is either a string
        containing ``" not_set"`` or the number of genes, followed by the
        genes themselves from position 2 onwards.
    gen : str
        When ``'yes'``, the hit count ``k`` is capped (see below).

    Returns
    -------
    pandas.DataFrame or list
        One row per gene set with q-value <= 0.1 (p-value, q-value and the
        tuple ``(N, n, K, k, genes)``), sorted by column ``1``; an empty list
        when nothing passes.

    Notes
    -----
    ``estimate`` is defined outside this block; presumably it converts
    p-values into q-values - confirm against its definition.
    """
    pval_arrays_new = list()
    N = len(refset)
    n = len(set(quiery.keys()))  # overwritten by n_adj after the first loop
    genes = defaultdict(list)
    passed_genes = set()
    nums = list()
    n_adj = 0
    # Collect the genes of every locus and count the loci that contribute at
    # least one gene not already seen in an earlier locus.
    for index in quiery.keys():
        index_set = set()
        for snp in quiery[index]:
            if snp in coord_gene_dict.keys(
            ) and " not_set" not in coord_gene_dict[snp][1]:
                indexes = int(coord_gene_dict[snp][1])
                for i in range(indexes):
                    index_set.add(coord_gene_dict[snp][i + 2])
                    genes[index].append(coord_gene_dict[snp][i + 2])
        if len(index_set) != 0 and len(
                passed_genes.union(index_set)) > len(passed_genes):
            n_adj += 1
            passed_genes = passed_genes.union(index_set)
    n = n_adj
    for gene_set in clumpsetbd:
        K = len(set(clumpsetbd[gene_set]))
        count = set()  # loci that hit this gene set
        igenes = set()  # genes of this gene set that were hit
        for index in genes.keys():
            if len(set(genesetdb[gene_set]).intersection(set(
                    genes[index]))) != 0:
                count.add(index)
                igenes.update(
                    set(genesetdb[gene_set]).intersection(set(genes[index])))
        k = len(count)
        if gen == 'yes':
            # cap the hit count by the number of hit genes, the adjusted
            # number of loci and the set size
            k = min(len(set(genesetdb[gene_set]).intersection(igenes)), n_adj,
                    K, k)
            # n=n-(len(count)-k)
        nums.append(
            (N, n, K, k,
             ";".join(list(set(genesetdb[gene_set]).intersection(igenes)))))
        # NOTE(review): the population passed here is N - k rather than N -
        # confirm that this is intended.
        pval_arrays_new.append(hypergeom.sf(k - 1, N - k, n, K))
    qval_arrays = list(estimate(np.asarray(pval_arrays_new)))
    results = defaultdict(list)
    i = 0  # position of the current gene set in the p-/q-value lists
    for geneset in clumpsetbd:
        if qval_arrays[i] <= 0.1:
            results[geneset].append(pval_arrays_new[i])
            results[geneset].append(qval_arrays[i])
            results[geneset].append(nums[i])
        i += 1
    if len(results) != 0:
        df = pd.DataFrame.from_dict(results, orient='index')
        df.reset_index(inplace=True)
        df = df.sort_values(by=[1])
        return (df)
    else:
        return (list())
def calc_feat_overrep_mlp(condition, ttype):
    """Test feature groups for over-representation among non-zero MLP scores.

    For each of seven cell lines the pickled coefficients at
    ``predict4/results/mlpMaskCoefs<cell><condition><ttype>.pkl`` are turned
    into per-feature scores with ``garson``; feature names are read from the
    header line of ``predict4/train/Gm12878.matrix``.  Non-zero scores are
    counted per feature group:

    * ``'H'`` - names starting with ``H`` and ending with ``3``
    * ``'B'`` - names starting with ``H`` and ending with ``1``
    * ``'C'`` - names starting with ``E``

    The floored mean count per group over all cell lines is then tested with
    ``hypergeom.sf`` against the group size.  Group sizes and the feature
    total come from the last cell line loaded.

    Prints the condition/type and one line per group; returns the list of
    p-values in the order ``['H', 'B', 'C']``.

    NOTE: ``pickle.load`` executes arbitrary code from the file; only use
    this on trusted result files.
    """
    cells = ['Gm12878', 'H1hesc', 'Helas3', 'Hepg2', 'Huvec', 'K562', 'Nhek']
    keys = ['H', 'B', 'C']
    # One array PER key.  dict.fromkeys(keys, array) would make every key
    # share a single array, so each group would overwrite the others.
    nonzeros = {k: np.zeros(len(cells)) for k in keys}
    f_len = dict.fromkeys(keys, None)
    nz_len = dict.fromkeys(keys, None)

    for c, cell in enumerate(cells):
        folder = 'predict4/'
        with open(folder + 'results/mlpMaskCoefs' + cell + condition + ttype + '.pkl', 'rb') as handle:
            mlp_scores = pickle.load(handle)
        fs_scores = garson(mlp_scores)
        with open(folder + 'train/Gm12878.matrix', 'r') as f:
            features = f.readline().rstrip('\n').split('\t')[1:]
        hist_scores = dict(zip(features, fs_scores))
        # list(...) keeps this working on Python 3, where keys()/values()
        # are views rather than lists.
        features = list(hist_scores.keys())
        fs_scores = np.array(list(hist_scores.values()))

        # dtype=int so an empty group is still a valid index array
        ind = {
            'H': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('3')], dtype=int),
            'B': np.array([i for i, v in enumerate(features)
                           if v.startswith('H') and v.endswith('1')], dtype=int),
            'C': np.array([i for i, v in enumerate(features)
                           if v.startswith('E')], dtype=int),
        }
        for k in keys:
            nonzeros[k][c] = len(np.nonzero(fs_scores[ind[k]])[0])

    for k in keys:
        f_len[k] = len(fs_scores[ind[k]])
        nz_len[k] = math.floor(nonzeros[k].mean())
    t_nz = sum(nz_len[k] for k in keys)

    # Formatted single-argument print() behaves the same on Python 2 and 3.
    print('{} {}'.format(condition, ttype))
    pvals = []
    for k in keys:
        h_score = hypergeom.sf(nz_len[k], len(features), t_nz, f_len[k])
        pvals.append(h_score)
        print('{} {}'.format(k, h_score))
    return pvals
def plot_ratio_VMinOverV_vsN(rO, rV, rangeN, thre, lowV):
    """Find, per population size, the smallest sample meeting a probability bound.

    For every ``Ni`` in ``rangeN`` (skipping those below ``floor(lowV / rV)``)
    the population holds ``floor(Ni * rO)`` marked items and
    ``V = floor(Ni * rV)``.  Starting from ``max(floor(0.001 * V), lowV)`` the
    sample size is raised in steps of 5 until the probability of drawing more
    than ``floor(sample / 2)`` marked items is no longer above ``thre``.

    Returns ``(rN, pH)``: the population sizes that were evaluated and, for
    each, the ratio of the sample size found to ``V``.
    """
    ratios = []
    sizes = []
    step = 5
    for Ni in rangeN:
        # smallest population size for which lowV is reachable
        lowN = int(math.floor(lowV / rV))
        if Ni < lowN:
            continue

        marked = int(math.floor(Ni * rO))
        V = int(math.floor(Ni * rV))
        candidate = max(math.floor(0.001 * V), lowV)

        proba = 1
        found = 0
        while proba > thre:
            majority = math.floor(candidate / 2)
            proba = hypergeom.sf(majority, Ni, marked, candidate)
            found = candidate
            candidate = candidate + step

        ratios.append(float(found) / float(V))
        sizes.append(Ni)
    return (sizes, ratios)
def calcPValues(n, N1, params, P_actu):
    """Return mid-p values of a hypergeometric test, one per observation.

    For observation ``P_actu[i]`` the population size is ``N1``, the number
    of successes in the population is ``params[i]`` and ``n`` items are
    drawn.  The mid-p value is ``P(X > observed) + 0.5 * P(X == observed)``.
    """
    mid_p = []
    for i, observed in enumerate(P_actu):
        successes = params[i]
        upper_tail = hypergeom.sf(observed, N1, successes, n)
        point_mass = hypergeom.pmf(observed, N1, successes, n)
        mid_p.append(upper_tail + 0.5 * point_mass)
    return mid_p
def find_hypergeometric(genes, pred_no_training, M=10683):
    """Print and return the fold enrichment of the overlap of two gene lists.

    Parameters
    ----------
    genes : iterable
        Reference gene list; its length is the number of draws.
    pred_no_training : iterable
        Predicted genes; its length is the number of successes in the
        population.
    M : int, optional
        Population size.  Defaults to 10683, the value that used to be
        hard-coded.

    Returns
    -------
    list of float
        Observed overlap divided by the most probable overlap under the null
        hypergeometric distribution (one entry per mode of the distribution).
        The division is by zero when the most probable overlap is 0.

    The hypergeometric p-value ``P(X >= overlap)`` is printed, not returned.
    """
    overlap = list(set(genes) & set(pred_no_training))
    N = len(genes)
    n = len(pred_no_training)
    x = len(overlap)
    pval = hypergeom.sf(x - 1, M, n, N)

    # Most probable overlap: the mode(s) of the null distribution.
    rv = hypergeom(M, n, N)
    prob = rv.pmf(np.arange(0, n + 1))
    modes = np.where(prob == np.max(prob))[0]

    fold = (x / modes).tolist()
    print('Fold Enrichment', fold)
    print('hypergeometric p-value', pval)
    return fold
def hypergeom_test(
        positive_samples: int, samples: int, positive_total: int, total: int
) -> float:
    """
    Thin wrapper around the scipy hypergeometric survival function.

    Parameters
    ----------
    positive_samples: int
        Number of successes among the drawn samples (positive marbles drawn)
    samples: int
        Number of samples drawn (marbles drawn)
    positive_total: int
        Number of positives in the reference set (positive marbles in the bag)
    total: int
        Size of the reference set (marbles in the bag)

    Returns
    -------
    float
        Probability of drawing at least ``positive_samples`` positives
    """
    # sf(k) is P(X > k); shifting by one includes the observed count.
    # see https://blog.alexlenail.me/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458  # noqa: 501
    threshold = positive_samples - 1
    p_value = hypergeom.sf(threshold, total, positive_total, samples)
    return float(p_value)
def calculate_escore(indices, N, X, L, hgp_thresh, tol):
    """Calculate the XL-mHG E-score, using scipy to calculate HG p-values.

    Parameters
    ----------
    indices : 1-dim numpy.ndarray of uint16
        Positions that are walked in order; the walk stops at the first
        index that is >= ``L``.
    N : int
        Population size.
    X : int
        Minimum number of hits before a cutoff is considered.
    L : int
        Only indices below ``L`` are considered.
    hgp_thresh : float
        A cutoff counts only if its hypergeometric p-value is at most this
        threshold (within ``tol``).
    tol : float
        Tolerance passed to ``mhg.is_equal``.

    Returns
    -------
    float
        The largest fold enrichment among qualifying cutoffs, or NaN when no
        cutoff qualifies.
    """
    assert isinstance(indices, np.ndarray) and indices.ndim == 1 and \
        np.issubdtype(indices.dtype, np.uint16)
    assert isinstance(N, int)
    assert isinstance(X, int)
    assert isinstance(L, int)
    assert isinstance(hgp_thresh, float)
    assert isinstance(tol, float)

    K = indices.size
    k = 0
    escore = 0.0
    for i in indices:
        if i >= L:
            break
        # Convert to a Python int first: uint16 scalar arithmetic can stay
        # 16-bit, so ``i + 1`` and ``n * K`` could wrap around.
        n = int(i) + 1
        k += 1
        if k >= X:
            # observed hits divided by the expected number of hits
            e = k / ((n * K) / float(N))
            if e > escore and not mhg.is_equal(e, escore, tol):
                hgp = hypergeom.sf(k - 1, N, K, n)
                if hgp <= hgp_thresh or mhg.is_equal(hgp, hgp_thresh, tol):
                    escore = e
    if escore == 0.0:
        escore = float('nan')
    return escore
def HyperGeometric(self, deg_in_special_go, allgene_has_go, allgene_in_special_go, alldeg_has_go):
    '''
    Hypergeometric enrichment of one GO term among the differentially
    expressed genes (DEGs); stores the results on the instance.

    The test is
    hypergeom.sf(deg_in_special_go-1, allgene_has_go, allgene_in_special_go, alldeg_has_go)

    GO example: extracellular region, 61 in 715 DEGs, 195 in 4564 all GO genes.
    The arguments would then be, in order: 61 DEGs belong to the GO term,
    4564 genes have a GO annotation, 195 of them belong to the GO term, and
    there are 715 DEGs.

                               DEG     non-DEG
    in the GO term             100     1000
    not in the GO term         500     10000

    i.e. 11600 genes in total, 600 of them DEGs; 1100 belong to the GO term,
    100 of which are DEGs.
    x=100, m=1100, n=10500, k=600  ->  (x, m+n, m, k)
    stats.hypergeom.sf(99, 11600, 1100, 600)

    Sets self.pValue (P(X >= deg_in_special_go)) and self.enrich (ratio of
    the GO-term fraction among DEGs to the fraction among all genes).
    '''
    shifted = deg_in_special_go - 1
    p_value = hypergeom.sf(shifted, allgene_has_go, allgene_in_special_go,
                           alldeg_has_go)
    self.pValue = float(p_value)

    deg_fraction = deg_in_special_go / alldeg_has_go
    background_fraction = allgene_in_special_go / allgene_has_go
    self.enrich = float(deg_fraction / background_fraction)
def calc_hg_enrichment_pval(mat, a, arm_a, aneu_type_a, b, arm_b, aneu_type_b):
    """Hypergeometric p-value for the co-occurrence of two arm-level events.

    Column ``f"{a}{arm_a}"`` of ``mat`` is compared with ``aneu_type_a`` and
    column ``f"{b}{arm_b}"`` with ``aneu_type_b``.  With ``mat.shape[0]`` rows
    as the population, the result is the probability of at least as many rows
    showing both events as observed, computed as ``sf(k) + pmf(k)``.
    """
    has_a = mat.loc[:, "{}{}".format(a, arm_a)].values == aneu_type_a
    has_b = mat.loc[:, "{}{}".format(b, arm_b)].values == aneu_type_b
    lacks_a = mat.loc[:, "{}{}".format(a, arm_a)].values != aneu_type_a
    lacks_b = mat.loc[:, "{}{}".format(b, arm_b)].values != aneu_type_b

    n_overlap = np.sum(np.logical_and(has_a, has_b))
    n_a = np.sum(np.logical_and(has_a, lacks_b))  # event a only
    n_b = np.sum(np.logical_and(lacks_a, has_b))  # event b only

    population = mat.shape[0]
    total_a = n_overlap + n_a
    total_b = n_overlap + n_b
    # P(X > k) + P(X == k) == P(X >= k)
    upper_tail = hypergeom.sf(n_overlap, population, total_a, total_b)
    point_mass = hypergeom.pmf(n_overlap, population, total_a, total_b)
    pval = upper_tail + point_mass
    return pval
def computeP(self,db,length,totWords):
    # Looks up the total count of self.word in db (stored as self.total) and
    # computes the hypergeometric p-value of seeing at least self.freq
    # occurrences (stored as self.p): population totWords, self.total
    # successes, length draws.
    # The word count database must be built by the 'totalWordCounts' function.
    from scipy.stats import hypergeom
    word_total = db[self.word]
    self.total = word_total
    observed_minus_one = self.freq - 1
    self.p = hypergeom.sf(observed_minus_one, totWords, word_total, length)
def calculate_pvalues(nodes, query, background_attribute, M, min_category_size=3, max_category_size=500, max_category_depth=5, **kwargs):
    """
    calculate enrichment pvalues for every category node

    :param nodes: nodes dictionary from the ontology graph after background was set
    :param query: set of identifiers for which the p value is calculated
    :param background_attribute: node attribute assoc. with the background set
    :param M: background size, total number of genes in the data
    :param min_category_size: categories of this size or smaller are ignored
    :param max_category_size: categories larger than this number are ignored
    :param max_category_depth: categories lower in the hierarchy (more specific) will be ignored
    :returns: pvalues, x, n (three parallel arrays; ignored categories get NaN)
    """
    draws = len(query)
    rows = []
    for node in nodes:
        members = node[background_attribute]
        size = len(members)
        n_hits = len(query.intersection(members))

        too_deep = node.get('depth', 0) > max_category_depth
        skip = too_deep or (size <= min_category_size) or (size > max_category_size)
        if skip:
            pvalue = float('NaN')
        else:
            # P(X >= n_hits)
            pvalue = hypergeom.sf(n_hits - 1, M, size, draws)
        rows.append((pvalue, n_hits, size))
    return [np.array(column) for column in zip(*rows)]
def get_pval(setA, setB, population):
    """Hypergeometric p-value for the overlap between ``setA`` and ``setB``.

    ``setA`` is the sample, ``setB`` holds the successes and ``population``
    is the universe both are drawn from.  Returns the probability of an
    overlap at least as large as the observed one.
    """
    pop_size = len(population)
    n_successes = len(setB)
    sample_size = len(setA)
    # number of sample members that are also successes
    n_hits = np.isin(setA, setB).sum()
    return hypergeom.sf(n_hits - 1, pop_size, n_successes, sample_size)
def _perform_enrichment_bonferroni(clusters, annotations, n_cores=1):
    """Bonferroni-corrected hypergeometric enrichment of annotations per cluster.

    Parameters
    ----------
    clusters : sequence of numpy.ndarray
        Each array holds the genes of one cluster.
    annotations : pandas.DataFrame
        Indexed by gene, with an ``annotations`` column (a gene may appear on
        several rows).
    n_cores : int
        Currently unused.

    Returns
    -------
    tuple
        ``(p_values, gene_coverage, cluster_coverage)`` where ``p_values`` is
        a list of ``(cluster_index, p_value, annotation)`` for every
        significant pair, ``gene_coverage`` is the fraction of annotated
        genes found in at least one significant pair and ``cluster_coverage``
        the fraction of clusters with at least one significant annotation.
    """
    #TODO implement parallel execution
    N = len(np.unique(annotations.index))
    n_tests = len(list(np.unique(annotations['annotations']))) * len(clusters)

    p_values = []
    covered_genes = []
    enriched_clusters = 0
    for cluster_idx, cluster in enumerate(clusters):
        annotated_members = list(
            set(annotations.index).intersection(set(list(cluster.ravel()))))
        n = len(annotated_members)
        cluster_ann = annotations.loc[annotated_members]

        found_any = False
        for label in np.unique(cluster_ann['annotations']):
            K = annotations[annotations['annotations'] == label].shape[0]
            in_cluster = cluster_ann[cluster_ann['annotations'] == label]
            k = in_cluster.shape[0]
            pval = hypergeom.sf(k - 1, N, K, n)
            if pval < (0.05 / n_tests):
                covered_genes += list(np.unique(in_cluster.index))
                p_values.append((cluster_idx, pval, label))
                found_any = True
        if found_any:
            enriched_clusters += 1

    gene_coverage = len(set(covered_genes)) / N
    cluster_coverage = enriched_clusters / len(clusters)
    return p_values, gene_coverage, cluster_coverage
def plot_ratio_VoverN(rO, rN, thre, lowV):
    """Find, per population size, the smallest sample meeting a probability bound.

    Parameters
    ----------
    rO : float
        Fraction of marked items; the population of size ``Ni`` holds
        ``floor(Ni * rO)`` of them.
    rN : iterable of int
        Population sizes to evaluate.
    thre : float
        The search stops once the probability of drawing more than
        ``floor(sample / 2) + 1`` marked items is no longer above this value.
    lowV : number
        Lower bound for the sample size; population sizes below it are
        skipped with a message.

    Returns
    -------
    tuple
        ``(rangeN, pH)``: the population sizes that were evaluated and, for
        each, the sample size found divided by the population size.
    """
    pH = []
    rangeN = []
    Vbin = 5  # step by which the sample size is increased
    for Ni in rN:
        # Start from 0.1% of the population, but never below lowV.  The value
        # is floored because hypergeom needs integer parameters; a fractional
        # one yields NaN, which would end the search immediately.
        Vmin = max(math.floor(0.001 * Ni), lowV)
        if Ni < lowV:
            print("check your param")
            continue
        # number of marked items, floored for the same reason
        O = int(math.floor(Ni * rO))

        proba_thre = 1
        Vi = Vmin
        V_thre = 0
        while proba_thre > thre:
            # math.floor rounds down, so p is one more than half the sample
            p = math.floor(Vi / 2) + 1
            proba_thre = hypergeom.sf(p, Ni, O, Vi)
            V_thre = Vi
            Vi = Vi + Vbin
        if V_thre < lowV:
            print("failed for (N,V,O) = (",Ni,", ",V_thre,", ",rO*100,") --> ",float(V_thre)/float(Ni))
            continue
        pH.append(float(V_thre) / float(Ni))
        rangeN.append(Ni)
    return (rangeN, pH)
def get_xlmhg_stat_slow(v, X, L, tol=1e-12):
    """Calculate the XL-mHG test statistic (inefficient reference version).

    ``v`` is a uint8 array whose non-zero entries mark hits.  Every cutoff
    ``n`` in ``1..L`` that ends on a hit and has at least ``X`` hits so far
    is scored with a hypergeometric p-value; the smallest one is the
    statistic.

    Returns ``(stat, n_star)``: the statistic (capped at 1.0) and the cutoff
    at which it was reached (0 if no cutoff qualified).  Returns ``(1.0, 0)``
    when ``v`` contains no hits.

    Raises ``ValueError`` when ``X`` or ``L`` is outside ``1..N``.
    """
    # type checking
    assert isinstance(v, np.ndarray)
    assert v.dtype == np.uint8
    assert isinstance(X, int)
    assert isinstance(L, int)

    # check if values are valid
    N = v.size
    if not (1 <= X <= N):
        raise ValueError('Invalid value X=%d; should be >= 1 and <= N.' %(X))
    if not (1 <= L <= N):
        raise ValueError('Invalid value L=%d; should be >= 1 and <= N.' %(L))

    K = int(np.sum(v != 0))
    if K == 0:
        # special case when K=0
        return 1.0, 0

    hits = 0
    best = 1.1  # larger than any p-value, so the first candidate wins
    best_cutoff = 0
    for cutoff, entry in enumerate(v[:L], start=1):
        if entry == 0:
            continue
        hits += 1
        if hits < X:
            continue
        hgp = hypergeom.sf(hits-1, N, K, cutoff)
        if hgp < best and not mhg.is_equal(hgp, best, tol):
            best = hgp
            best_cutoff = cutoff

    return min(best, 1.0), best_cutoff
def test_custom_domain_iterable(self, gene_ontology): features_of_interest = gene_ontology.all_genes[:10] domain = ['cellular_component', 'molecular_function'] test_enrichment_df = gene_ontology.enrichment(features_of_interest, domain=domain) domains = frozenset(domain) p_value_cutoff = 1000000 min_feature_size = 3 min_background_size = 5 cross_reference = {} background = gene_ontology.all_genes n_all_genes = len(background) n_features_of_interest = len(features_of_interest) enrichment = defaultdict(dict) for go_term, go_genes in gene_ontology.ontology.items(): if go_genes['domain'] not in domains: continue features_in_go = go_genes['genes'].intersection( features_of_interest) background_in_go = go_genes['genes'].intersection(background) too_few_features = len(features_in_go) < min_feature_size too_few_background = len(background_in_go) < min_background_size if too_few_features or too_few_background: continue # Survival function is more accurate on small p-values p_value = hypergeom.sf(len(features_in_go), n_all_genes, len(background_in_go), n_features_of_interest) p_value = 0 if p_value < 0 else p_value symbols = [cross_reference[f] if f in cross_reference else f for f in features_in_go] enrichment['p_value'][go_term] = p_value enrichment['n_features_of_interest_in_go_term'][go_term] = len( features_in_go) enrichment['n_background_in_go_term'][go_term] = len( background_in_go) enrichment['n_features_total_in_go_term'][go_term] = len( go_genes['genes']) enrichment['features_of_interest_in_go_term'][ go_term] = ','.join(features_in_go) enrichment['features_of_interest_in_go_term_gene_symbols'][ go_term] = ','.join(symbols) enrichment['go_domain'][go_term] = go_genes['domain'] enrichment['go_name'][go_term] = go_genes['name'] enrichment_df = pd.DataFrame(enrichment) # Bonferonni correction enrichment_df['bonferonni_corrected_p_value'] = \ enrichment_df.p_value * enrichment_df.shape[0] ind = enrichment_df['bonferonni_corrected_p_value'] < p_value_cutoff enrichment_df 
= enrichment_df.ix[ind] true_enrichment_df = enrichment_df.sort(columns=['p_value']) pdt.assert_frame_equal(test_enrichment_df, true_enrichment_df)
def GO_enrichment(geneList, ontology, expressedGenes=None, printIt=False,
                  pCut=1000000, xRef=None):
    """Bonferroni-corrected hypergeometric GO enrichment of a gene list.

    Parameters
    ----------
    geneList : iterable
        Gene identifiers to test.
    ontology : dict
        Maps GO term id -> dict with at least ``'genes'`` (a set) and
        ``'name'`` (joined with ``"|"``, so presumably a list of strings).
    expressedGenes : iterable, optional
        Background genes. When None, the union of all genes in ``ontology``
        is used (the original code crashed on ``list(None)``).
    printIt : bool, optional
        When True, print each reported row as a tab-separated line.
    pCut : float, optional
        Rows whose corrected p-value exceeds this value are dropped.
    xRef : dict-like, optional
        Maps gene id -> gene symbol; ids without an entry are kept as is.

    Returns
    -------
    pandas.DataFrame
        One row per reported GO term, ordered by p-value and indexed by
        'GO Term ID'. When nothing is reported, an empty frame with all
        eight columns (including 'GO Term ID') is returned.

    Notes
    -----
    A term is tested only if more than 3 listed genes and at least 5
    background genes belong to it. The raw p-value is
    ``hypergeom.sf(overlap, ...)``, kept from the original code.
    NOTE(review): that is P(X > overlap); confirm whether P(X >= overlap)
    was intended.
    The correction factor is the number of terms actually tested. The
    original never applied it: the counter stayed at 0 and the attempted
    assignment into a tuple was swallowed by a bare ``except``.
    """
    columns = ['GO Term ID', 'GO Term Description',
               'Bonferroni-corrected Hypergeometric p-Value',
               'N Genes in List and GO Category',
               'N Expressed Genes in GO Category',
               'N Genes in GO category', 'Ensembl Gene IDs in List',
               'Gene symbols in List']
    xRef = {} if xRef is None else xRef
    geneList = list(geneList)
    if expressedGenes is None:
        background = set()
        for GOGenes in ontology.values():
            background.update(GOGenes['genes'])
        expressedGenes = background
    expressedGenes = list(expressedGenes)
    lenAllGenes, lenTheseGenes = len(expressedGenes), len(geneList)

    # (term, raw p-value, genes in both, n expressed, n total, symbols)
    tested = []
    for GOTerm, GOGenes in ontology.items():
        inBoth = GOGenes['genes'].intersection(geneList)
        expressedGOGenes = GOGenes['genes'].intersection(expressedGenes)
        if len(inBoth) <= 3 or len(expressedGOGenes) < 5:
            continue
        pVal = hypergeom.sf(len(inBoth), lenAllGenes, len(expressedGOGenes),
                            lenTheseGenes)
        if pVal < 0:
            pVal = 0
        symbols = [xRef[ensg] if ensg in xRef else ensg for ensg in inBoth]
        tested.append((GOTerm, pVal, inBoth, len(expressedGOGenes),
                       len(GOGenes['genes']), symbols))

    nCmps = len(tested)  # number of comparisons for the Bonferroni factor

    rows = []
    for GOTerm, pVal, inBoth, nExpressed, nTotal, symbols in sorted(
            tested, key=lambda entry: entry[1]):
        if not isinstance(GOTerm, str):
            continue
        corrected = pVal * float(nCmps)
        if corrected > pCut:
            continue
        try:
            row = [GOTerm, "|".join(ontology[GOTerm]['name']), corrected,
                   len(inBoth), nExpressed, nTotal, ",".join(inBoth),
                   ",".join(symbols)]
        except TypeError:
            # non-string names/ids cannot be joined; skip the row as the
            # original best-effort code did
            continue
        if printIt:
            print("\t".join(str(field) for field in row))
        rows.append(row)

    if not rows:
        return pd.DataFrame(None, columns=columns)
    df = pd.DataFrame(rows, columns=columns)
    df.set_index('GO Term ID', inplace=True)
    return df
def compute_p(i, M, N):
    """Return ``hypergeom.sf(i.counts, M, i.all_counts, N)`` for one entry.

    i: object with ``counts`` (black balls in the draw) and ``all_counts``
       (black balls in total) attributes.
    M: population size.
    N: number drawn.
    """
    drawn_black = i.counts
    total_black = i.all_counts
    return hypergeom.sf(drawn_black, M, total_black, N)
def my_static_result(my_v, my_static_genes, my_gene_set):
    """Build a StaticGSEResult for one gene set.

    The p-value is ``hypergeom.sf(k - 1, N, K, n)`` with ``N = my_v.size``,
    ``K = my_gene_set.size``, ``n = len(my_static_genes)`` and ``k`` the
    number of genes shared by ``my_static_genes`` and ``my_gene_set.genes``.
    """
    population = my_v.size
    drawn = len(my_static_genes)
    set_size = my_gene_set.size
    overlap = sorted(my_static_genes & my_gene_set.genes)
    pval = hypergeom.sf(len(overlap) - 1, population, set_size, drawn)
    return StaticGSEResult(my_gene_set, population, drawn, overlap, pval)
def computeP(self,db,length,totwords):
    """Store the word totals on ``self`` and compute the p-value ``self.p``.

    db: mapping of word -> occurrences in the whole database.
    length: number of words in all genes of the query set.
    totwords: total number of words in the whole database.

    Sets ``self.total`` (``db[self.word]``), ``self.totwords``,
    ``self.length`` and ``self.p``, where ``self.p`` is
    ``hypergeom.sf(self.freq - 1, totwords, total, length)``.
    """
    self.total = db[self.word]
    self.totwords, self.length = totwords, length
    observed = self.freq - 1
    self.p = hypergeom.sf(observed, self.totwords, self.total, self.length)
def getPvalHypergeom(go,tot1s,totGenes):
    """Return the hypergeometric tail probability for GO term ``go``.

    Walks the genes listed in the module-level ``GOs[go]``; genes missing
    from the module-level ``scores`` mapping are ignored. ``hit`` is the sum
    of the scores found and ``tot`` the number of scored genes.

    go: key into ``GOs``.
    tot1s: passed to ``hypergeom.sf`` as the number of success states.
    totGenes: passed to ``hypergeom.sf`` as the population size.

    Returns ``hypergeom.sf(int(hit), totGenes, tot1s, tot)``. The original
    computed this value but never returned it.
    NOTE(review): sf(hit) is P(X > hit); confirm whether P(X >= hit) was
    intended.
    """
    hit = 0
    tot = 0
    for gene in GOs[go]:
        try:
            score = scores[gene]
        except KeyError:
            # gene without a score: not part of the draw
            continue
        hit += score
        tot += 1
    return hypergeom.sf(int(hit), totGenes, tot1s, tot, loc=0)
def compute_p(i, M, N):
    """Compute the p-value for one row with "counts" and "all_counts".

    parameters:
        i: data entry exposing ``counts`` (black balls in the draw) and
           ``all_counts`` (black balls in total)
        M: total number of entries of this data type
        N: size of the subset the p-value is computed on (number drawn)

    returns:
        ``hypergeom.sf(i.counts, M, i.all_counts, N)``
    """
    in_draw, in_total = i.counts, i.all_counts
    p_value = hypergeom.sf(in_draw, M, in_total, N)
    return p_value
def test_set_hypergeom(selected_genes, all_genes, set_genes): # Reduce the gene_set to the universe of all_genes, # as we can only sample from this set. all_genes = set(all_genes) set_genes = set(set_genes).intersection(all_genes) selected_genes = set(selected_genes).intersection(all_genes) # Calculate overlap of the gene_set with selected genes. selected_set_genes = set_genes.intersection(selected_genes) # Calculate p-value using the hyper-geometric test. p_val = hypergeom.sf(M=len(all_genes), n=len(set_genes), N=len(selected_genes), k=len(selected_set_genes), loc=1) return p_val
def test_no_enrichment(self, gene_ontology): features_of_interest = gene_ontology.all_genes[:2] test_enrichment_df = gene_ontology.enrichment(features_of_interest) domains = gene_ontology.domains min_feature_size = 3 min_background_size = 5 cross_reference = {} background = gene_ontology.all_genes n_all_genes = len(background) n_features_of_interest = len(features_of_interest) enrichment = defaultdict(dict) for go_term, go_genes in gene_ontology.ontology.items(): if go_genes['domain'] not in domains: continue features_in_go = go_genes['genes'].intersection( features_of_interest) background_in_go = go_genes['genes'].intersection(background) too_few_features = len(features_in_go) < min_feature_size too_few_background = len(background_in_go) < min_background_size if too_few_features or too_few_background: continue # Survival function is more accurate on small p-values p_value = hypergeom.sf(len(features_in_go), n_all_genes, len(background_in_go), n_features_of_interest) p_value = 0 if p_value < 0 else p_value symbols = [cross_reference[f] if f in cross_reference else f for f in features_in_go] enrichment['p_value'][go_term] = p_value enrichment['n_features_of_interest_in_go_term'][go_term] = len( features_in_go) enrichment['n_background_in_go_term'][go_term] = len( background_in_go) enrichment['n_features_total_in_go_term'][go_term] = len( go_genes['genes']) enrichment['features_of_interest_in_go_term'][ go_term] = ','.join(features_in_go) enrichment['features_of_interest_in_go_term_gene_symbols'][ go_term] = ','.join(symbols) enrichment['go_domain'][go_term] = go_genes['domain'] enrichment['go_name'][go_term] = go_genes['name'] true_enrichment_df = pd.DataFrame(enrichment) assert true_enrichment_df.empty assert test_enrichment_df is None
def test(module, annots_dict, inverse_annots_dict, mode='standard'):
    """Run a hypergeometric test for every function found in a gene module.

    Parameters
    ----------
    module : [string]
        The genes in the module.
    annots_dict : {string: [string]} dictionary
        Maps each gene to its functions; its length is the population size.
    inverse_annots_dict : {string: [string]} dictionary
        Maps each function to its genes.
    mode : {'standard', 'conditional'}, optional
        Any value starting with 'c' selects the conditional test, which
        divides each p-value by the probability of drawing at least one
        labeled gene (default: standard).

    Returns
    -------
    d : {string: float} dictionary
        Maps each function represented in the module to its p-value,
        P(X >= number of module genes carrying the function).
    """
    population = len(annots_dict)
    draws = len(module)
    pvalues = {}
    for function in unions([annots_dict[gene] for gene in module]):
        labeled_total = len(inverse_annots_dict[function])
        labeled_drawn = sum(
            1 for gene in module if function in annots_dict[gene])
        pval = hypergeom.sf(labeled_drawn - 1, population, labeled_total,
                            draws)
        if mode.startswith('c'):
            # condition on observing at least one labeled gene
            pval /= hypergeom.sf(0, population, labeled_total, draws)
        pvalues[function] = pval
    return pvalues
def enrich(inputgenes,backgroundgenes,dbfilename,verbose=False,returnn=20):
    """Hypergeometric enrichment of a gene set against the sets in a gmt file.

    "P values" are the (hypergeometric) probability that at least as many
    genes from each pathway were observed as occurred in the input set,
    i.e. 1 - CDF(n - 1) where n is the number of genes observed.

    Arguments:
        inputgenes: array-like of gene symbols, the set to be analyzed
        backgroundgenes: array-like of gene symbols, the background from
            which the set was drawn
        dbfilename: path of a tab-separated gmt file; each line holds a set
            name, a link and then the genes of the set
        verbose=False: if true, print name, link and p value of each set
        returnn=20: return at most this many sets

    Returns:
        A list of ``[name, link, p_value]`` lists sorted by p value in
        ascending order. Sets sharing fewer than two genes with the input
        are not reported.

    Example:
        [["name", "http://link", .0001]]
    """
    genes = np.unique(inputgenes)
    background = np.unique(backgroundgenes)
    ntrys = len(genes)
    total = len(background)

    names = []
    links = []
    probs = []
    # the context manager closes the file even if a line fails to parse
    with open(dbfilename) as gmtDB:
        for line in gmtDB:
            vs = line.rstrip().split("\t")
            # de-duplicate: the in1d calls below assume unique inputs
            setgenes = np.unique(vs[2:])
            nfound = np.sum(np.in1d(genes, setgenes, assume_unique=True))
            if nfound > 1:
                npresent = np.sum(
                    np.in1d(setgenes, background, assume_unique=True))
                prob = hypergeom.sf(nfound - 1, total, npresent, ntrys)
                names.append(vs[0])
                links.append(vs[1])
                probs.append(prob)
                if verbose:
                    print("\t".join([vs[0], vs[1], str(prob)]))

    order = np.argsort(np.array(probs))[0:returnn]
    return [[names[i], links[i], probs[i]] for i in order]
def enrichment(self,gene_list,pval_cutoff=0.05,gene_filter=None,label=None,max_term_size=300):
    """Hypergeometric enrichment of ontology terms for a list of genes.

    Parameters
    ----------
    gene_list : sequence
        Objects exposing an ``id`` attribute.
    pval_cutoff : float, optional
        Only terms with ``pval <= pval_cutoff`` are returned.
    gene_filter : container, optional
        When given, term genes not contained in it are ignored.
    label : optional
        When truthy, it is logged and added as a 'Label' column.
    max_term_size : int, optional
        Terms with more genes than this (before filtering) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per retained term with columns TermID, Name, pval, LenTerm,
        LenOverlap, LenList, LenTotal, Type, TermGenes and Desc, sorted by
        pval and indexed by TermID. An empty frame is returned when the
        table cannot be built.

    Notes
    -----
    The gene ids are bound as SQL parameters (in chunks, to stay below
    SQLite's bound-variable limit) instead of being pasted into the query
    text. Terms without a row in ``terms`` are logged and skipped; the
    original fell through and used undefined or stale term information.
    """
    if label:
        self.log("Caculating Enrichemnt for {}",label)
    cur = self.db.cursor()
    gene_ids = [x.id for x in gene_list]
    query_genes = set(gene_ids)

    # collect the distinct terms attached to any of the genes
    terms = []
    seen = set()
    chunk_size = 500
    for start in range(0, len(gene_ids), chunk_size):
        chunk = gene_ids[start:start + chunk_size]
        placeholders = ",".join("?" * len(chunk))
        rows = cur.execute(
            "SELECT DISTINCT(term) FROM gene_terms WHERE gene IN ({});".format(placeholders),
            chunk).fetchall()
        for (term,) in rows:
            if term not in seen:
                seen.add(term)
                terms.append(term)

    # the population size does not depend on the term: query it once
    num_genes_total, = cur.execute(
        'SELECT COUNT(DISTINCT(gene)) FROM gene_terms;').fetchone()

    # compute hypergeometric for each term
    records = []
    for term in terms:
        try:
            (term_id, name, term_type, desc) = cur.execute(
                "SELECT * FROM terms WHERE id = ?", (term,)).fetchone()
        except TypeError:
            # fetchone() returned None: the term has no description row
            self.log("No information for ontology term {}", term)
            continue
        genes_in_term = [x[0] for x in cur.execute(
            '''SELECT gene FROM gene_terms WHERE term = ?''', (term_id,))]
        if len(genes_in_term) > max_term_size:
            self.log("Skipping {} due to size ({})",name,len(genes_in_term))
            continue
        if gene_filter:
            genes_in_term = [gene for gene in genes_in_term
                             if gene in gene_filter]
        num_genes_in_term = len(genes_in_term)
        overlap = set(genes_in_term).intersection(query_genes)
        pval = hypergeom.sf(len(overlap) - 1, num_genes_total,
                            num_genes_in_term, len(gene_list))
        term_genes = ",".join(overlap)
        records.append(
            (term_id, name, pval, num_genes_in_term, len(overlap),
             len(gene_list), num_genes_total, term_type, term_genes, desc))

    try:
        table = DataFrame(
            records,
            columns=['TermID','Name','pval','LenTerm','LenOverlap','LenList','LenTotal','Type','TermGenes','Desc'])
        # DataFrame.sort no longer exists in current pandas; fall back to it
        # only on very old versions
        if hasattr(table, 'sort_values'):
            table = table.sort_values('pval', ascending=True)
        else:
            table = table.sort('pval', ascending=True)
        table.index = table.TermID
    except ValueError:
        self.log("No enrichment for {}",",".join([x.id for x in gene_list]))
        return DataFrame()
    if label:
        table['Label'] = label
    return table[table.pval <= pval_cutoff]
def calculate_pvalues(G, query, min_hit_size=2, min_category_size=3, max_category_size=500, max_category_depth=5, **kwargs):
    """Calculate p-values for all categories in the graph.

    Every node first loses its query-related attributes and gets a fresh
    ``'n'`` (size of its background). Nodes passing the size, depth and hit
    filters additionally receive ``'query'``, ``'N'``, ``'hits'``, ``'x'``
    and ``'p'``.

    :param G: ontology graph after the background was set; iterated for node
        ids, with node attribute dicts read from ``G.node``
    :param query: array_like of identifiers
    :param min_hit_size: minimum intersection size of query and category
    :param min_category_size: categories smaller than this are ignored
    :param max_category_size: categories larger than this are ignored
    :param max_category_depth: categories deeper than this are ignored
    :returns: dictionary of term : pvalue, with
        ``pvalue = hypergeom.sf(x, M, n, N)`` and ``M`` read from the node
    """
    stale_attrs = ('query', 'n', 'N', 'hits', 'x', 'p', 'q', 'significant')
    query_set = set(query)
    N = len(query_set)
    pvalues = {}
    for term in G:
        node = G.node[term]
        # reset all query related attributes
        for attr in stale_attrs:
            node.pop(attr, None)

        background = node.get('background', set([]))
        n = len(background)
        node['n'] = n
        hits = query_set.intersection(background)
        x = len(hits)
        # depth might not be set due to malformed ontology
        depth = node.get('depth', -1)

        if depth > max_category_depth:
            continue
        if n < min_category_size or n > max_category_size:
            continue
        if x < min_hit_size:
            continue

        node.update(query=query_set, N=N, hits=hits, x=x)
        p = hypergeom.sf(x, node['M'], n, N)
        node['p'] = p
        pvalues[term] = p
    return pvalues
def hypergeometric_test(x, M, n, N):
    """Return the lower- and upper-tail hypergeometric p-values for ``x``.

    The hypergeometric distribution models drawing objects from a bin.
    - M is the total number of objects
    - n is the total number of Type I objects
    - x (random variate) is the number of Type I objects among the N objects
      drawn without replacement from the total population

    Returns a tuple ``(pv_le, pv_gt)``:
    - ``pv_le`` = P(X <= x), the depletion tail. scipy's ``cdf`` is already
      inclusive; the previous ``cdf(x + 1)`` returned P(X <= x + 1).
    - ``pv_gt`` = P(X >= x), the enrichment tail, via ``sf(x - 1)``.

    - http://en.wikipedia.org/wiki/Hypergeometric_distribution
    - https://www.biostars.org/p/66729/
    - http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.hypergeom.html
    - http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.hypergeometric.html
    - http://stackoverflow.com/questions/6594840/what-are-equivalents-to-rs-phyper-function-in-python
    """
    assert n <= M
    assert x <= n
    assert N <= M
    pv_le = hypergeom.cdf(x, M, n, N)
    pv_gt = hypergeom.sf(x - 1, M, n, N)  # 1-cdf sometimes more accurate
    return pv_le, pv_gt
def test_cross():
    """Compares p-values calculated using PVAL1 and PVAL2."""
    N, K = 50, 10
    W = N - K
    tol = 1e-8  # 1e-11 was used previously
    shape = (K + 1, W + 1)
    table = np.empty(shape, dtype=np.longdouble)

    # hypergeometric p-value of every (k, w) configuration, where k + w is
    # the number of draws
    configs = np.ones(shape, dtype=np.float64)
    for k in range(1, K + 1):
        for w in range(W):
            configs[k, w] = hypergeom.sf(k - 1, N, K, k + w)

    tests = 0
    for X in range(1, N + 1):
        for L in range(N, 0, -1):
            # collect all possible XL-mHG test statistics for this (X, L)
            S = np.ones(shape, dtype=np.float64)
            for n in range(L + 1):
                for k in range(min(K, n), X - 1, -1):
                    w = n - k
                    if w > W:
                        break
                    S[k, w] = configs[k, w]
            all_stat = np.sort(np.unique(S.ravel()))[::-1]

            for stat in all_stat:
                pval1 = mhg_cython.get_xlmhg_pval1(N, K, X, L, stat, table)
                pval2 = mhg_cython.get_xlmhg_pval2(N, K, X, L, stat, table)
                tests += 1
                assert mhg.is_equal(pval1, pval2, tol=tol)

    print('Calculated %d bounds, based on %d configurations.' %(tests, configs.size))
def copies_in_opening_hand(self, deck, hand_size=7):
    """Build a quiz question on drawing a random card in the opening hand.

    A card is picked at random from ``deck.decklist``; the correct answer is
    the probability, in percent, that at least one of its copies is among
    the ``hand_size`` cards drawn from the whole deck.

    Returns a tuple ``(question, correct_string, possible, answer_suffix,
    chosen_card)`` where ``possible`` is the shuffled list of four wrong
    answers from ``self.gen_wrong`` plus the correct one.
    """
    question_string = "How likely is it that at least one copy of {card} will be in your opening hand?"
    answer_suffix = 'percent'
    chosen_card = random.choice(deck.decklist)
    copies = chosen_card.count
    deck_size = sum(c.count for c in deck.decklist)
    # sf(k, ...) is P(X > k), so "at least one copy" is sf(0, ...).
    # The previous sf(1, ...) gave the chance of at least TWO copies.
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.hypergeom.html
    opening_hand_chance = hypergeom.sf(0, deck_size, copies, hand_size)
    opening_hand_chance = opening_hand_chance * 100
    correct_string = "{:.2f}".format(opening_hand_chance)
    wrongs = self.gen_wrong(opening_hand_chance, 'percent', 4)
    possible = wrongs + [correct_string]
    random.shuffle(possible)
    print("Chance of a copy of {} in opening hand: {}".format(chosen_card.name, correct_string))
    return question_string.format(card=chosen_card.name), correct_string, possible, answer_suffix, chosen_card
def post(self):
    """Run pathway enrichment for every gene list in the posted JSON body.

    The body is decoded as JSON. It must contain ``"lists"`` (mapping of
    list name -> genes) and may contain ``"background"``; when that is
    missing or empty the module-level ``defaultbackground`` is used. For
    each list, every entry of the module-level ``pathways`` sharing at least
    two genes with it is scored with ``hypergeom.sf(nfound - 1, ...)``.

    Writes ``{"results": {list name: [{"name", "link", "p"}, ...]}}`` to the
    response, each list sorted by ascending p-value. A body that is not
    valid JSON is answered with status 400 and processing stops (previously
    execution continued and failed on the undefined ``args``).
    """
    background = defaultbackground
    try:
        args = tornado.escape.json_decode(self.request.body)
    except ValueError:
        self.clear()
        self.set_status(400)
        self.finish("<html><head><title>400 Bad Request</title></head><body>Malformed JSON object in POST body.</body></html>")
        return
    if "background" in args and len(args["background"])!=0:
        background = np.unique(args["background"])
    total = len(background)

    replydata = {}
    for setname in args["lists"]:
        genes = np.unique(args["lists"][setname])
        ntrys = len(genes)
        names = []
        links = []
        probs = []
        for pathway in pathways:
            setgenes = pathway["genes"]
            nfound = np.sum(np.in1d(genes,setgenes,assume_unique=True))
            if nfound > 1:
                npresent = np.sum(np.in1d(setgenes,background,assume_unique=True))
                prob = hypergeom.sf(nfound-1,total,npresent,ntrys)
                names.append(pathway["name"])
                links.append(pathway["link"])
                probs.append(prob)
        # most significant pathway first
        replydata[setname] = [
            {"name":names[i],"link":links[i],"p":probs[i]}
            for i in np.argsort(np.array(probs))]
    json.dump({'results': replydata}, self)
def fit(self, df_X, df_y):
    """Compute per-cluster hypergeometric motif activities.

    For every cluster label in the single column of ``df_y`` and every
    column of ``df_X``, the p-value is ``hypergeom.sf(pt - 1, M, pt + nt,
    pt + pf)``, where ``pt``/``pf`` count the rows of the cluster with a
    value > 0 / == 0, ``nt`` counts the rows outside the cluster with a
    value > 0 and ``M`` is the number of rows.

    Stores ``-log10(p)`` in ``self.act_`` as a DataFrame with one row per
    column of ``df_X`` and one column per cluster.

    Raises ValueError if the row counts differ, ``df_y`` does not have
    exactly one column, or ``df_X`` is not made of integer columns only.
    """
    if df_y.shape[0] != df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")
    if set(df_X.dtypes) != set([np.dtype(int)]):
        raise ValueError("need motif counts, not scores")

    # calculate hypergeometric p-values
    clusters = df_y[df_y.columns[0]].unique()
    M = df_X.shape[0]
    pvals = []
    for cluster in clusters:
        pos = df_X[df_y.iloc[:,0] == cluster]
        neg = df_X[df_y.iloc[:,0] != cluster]
        pos_true = (pos > 0).sum(0)
        pos_false = (pos == 0).sum(0)
        neg_true = (neg > 0).sum(0)
        pvals.append([
            hypergeom.sf(pt - 1, M, pt + nt, pt + pf)
            for pt, pf, nt in zip(pos_true, pos_false, neg_true)
        ])

    # correct for multiple testing (the corrected values are not used below)
    pvals = np.array(pvals)
    fdr = multipletests(pvals.flatten(), method="fdr_bh")[1].reshape(pvals.shape)

    # create output DataFrame
    self.act_ = pd.DataFrame(-np.log10(pvals.T), columns=clusters, index=df_X.columns)
def hypergeometric_p_value(n_unique_nodes, intsec_card, sources, targets):
    """Hypergeometric tail probability per (source, target) pair.

    Work in progress!

    For pair ``i`` the result is ``hypergeom.sf(intsec_card[i], M, n, N)``
    with a fixed population size ``M = 220 * 220``, ``n`` the larger and
    ``N`` the smaller of ``n_unique_nodes[sources[i]]`` and
    ``n_unique_nodes[targets[i]]``.

    Returns a float array with one value per pair.
    """
    from scipy.stats import hypergeom

    # row 0: counts at the sources, row 1: counts at the targets
    endpoint_counts = np.vstack(
        (n_unique_nodes[sources], n_unique_nodes[targets]))

    M = 220*220                        # population size
    n = endpoint_counts.max(axis=0)    # success states in the population
    N = endpoint_counts.min(axis=0)    # total draws
    x = intsec_card                    # successes

    n_pairs = len(sources)
    return np.fromiter(
        (hypergeom.sf(x[i], M, n[i], N[i]) for i in range(n_pairs)),
        dtype=float, count=n_pairs)
def genpoints(prefix,frag):
    '''compute the fpr/tpr from all the shape constraint search files in
    this directory with the given prefix and frag

    Reads the totals from ``active.<frag>.cnt`` and ``decoy.<frag>.cnt``.
    For every file matching ``<prefix>.*.<frag>.actives*.out`` the number of
    lines is the active hit count; the matching "decoys" file gives the
    decoy hit count.

    Returns ``(pts, files)``: ``pts`` is an array with one
    ``(fpr, tpr, pval)`` row per file pair, where ``pval`` is multiplied by
    the number of file pairs (Bonferroni correction); ``files`` maps
    ``(fpr, tpr)`` to the actives file. With no matching files an empty
    ``(0, 3)`` array is returned (previously an IndexError was raised).
    '''
    acntf = 'active.%s.cnt' % frag
    dcntf = 'decoy.%s.cnt' % frag
    checkfiles([acntf,dcntf])
    with open(acntf) as fh:
        numactives = float(fh.read())
    with open(dcntf) as fh:
        numdecoys = float(fh.read())

    pts = []
    cnt = 0
    files = {}
    for afile in glob.glob('%s.*.%s.actives*.out' % (prefix,frag)):
        cnt += 1.0
        dfile = afile.replace('actives','decoys')
        checkfiles([dfile])
        # hit counts are the line counts; files are closed right away
        with open(afile) as fh:
            na = float(sum(1 for _ in fh))
        with open(dfile) as fh:
            nd = float(sum(1 for _ in fh))
        fpr = nd/numdecoys
        tpr = na/numactives
        if fpr > 1 or tpr > 1:
            print("BAD FPR/TPR: %.2f %2f %s %s" % (fpr,tpr,afile,dfile))
        if na == 0:
            pval = 1
        else:
            # NOTE(review): the number of success states passed here is na,
            # not numactives - confirm this is the intended test.
            pval = hypergeom.sf(na-1,numdecoys+numactives,na,nd+na)
        pts.append((fpr,tpr, pval))
        files[(fpr,tpr)] = afile

    if not pts:
        # keep the (n, 3) shape so callers can still slice columns
        return np.empty((0, 3)), files

    # bonferroni correction
    pts = np.array(pts)
    pts[:,2] *= cnt
    return pts, files
def copies_in_top_five(self, deck):
    """Build a quiz question on finding another copy in the top five cards.

    A card with more than one copy is picked at random. One copy is assumed
    to be in the seven-card opening hand, so the remaining deck is seven
    cards smaller and holds one copy less. The correct answer is the
    probability, in percent, that at least one remaining copy is among the
    next five cards.

    The serious version would track exactly which cards left the deck; this
    version just uses the scalar deck size.

    Returns a tuple ``(question, correct_string, possible, answer_suffix,
    chosen_card)`` where ``possible`` is the shuffled list of four wrong
    answers from ``self.gen_wrong`` plus the correct one.
    """
    question_string = "After drawing your opening hand with one copy of {card}, how likely is it that another copy of {card} is in the top five cards of your deck?"
    answer_suffix = 'percent'
    # Only cards with at least two copies can have "another copy" left.
    chosen_card = random.choice([card for card in deck.decklist if card.count > 1])
    remaining_copies = chosen_card.count - 1
    remaining_deck = sum(c.count for c in deck.decklist) - 7
    # sf(k, ...) is P(X > k), so "another copy" (at least one) is sf(0, ...).
    # The previous sf(1, ...) gave the chance of at least TWO more copies.
    in_top_five_chance = hypergeom.sf(0, remaining_deck, remaining_copies, 5)
    in_top_five_chance = in_top_five_chance * 100
    correct_string = "{:.2f}".format(in_top_five_chance)
    wrongs = self.gen_wrong(in_top_five_chance, 'percent', 4)
    possible = wrongs + [correct_string]
    random.shuffle(possible)
    print("Chance of a copy of {} in the next five cards: {}".format(chosen_card.name, correct_string))
    return question_string.format(card=chosen_card.name), correct_string, possible, answer_suffix, chosen_card
def get_static_enrichment(
        self, genes, pval_thresh, adjust_pval_thresh=True, K_min=3,
        gene_set_ids=None):
    """Find enriched gene sets in a set of genes.

    Parameters
    ----------
    genes : set of str
        The set of genes to test for gene set enrichment.
    pval_thresh : float
        The significance level (p-value threshold) to use in the analysis.
    adjust_pval_thresh : bool, optional
        Whether to adjust the p-value threshold using a Bonferroni
        correction. (Warning: This is a very conservative correction!)
        [True]
    K_min : int, optional
        The minimum number of gene set genes present in the analysis. [3]
    gene_set_ids : Iterable or None
        A list of gene set IDs to test. If ``None``, all gene sets are
        tested that meet the :attr:`K_min` criterion.

    Returns
    -------
    list of `StaticGSEResult`
        A list of all significantly enriched gene sets.

    Notes
    -----
    Genes that ``self._genome.index`` does not know are ignored, with a
    warning. The p-value of gene set ``j`` is
    ``hypergeom.sf(k_j - 1, N, K_j, n)``: ``N`` is the number of rows of
    the membership matrix, ``K_j`` the size of the gene set, ``n`` the
    number of known query genes and ``k_j`` how many of those belong to
    the set.
    """
    assert isinstance(genes, set)
    # np.float was only an alias of float and is gone from current NumPy
    assert isinstance(pval_thresh, (float, np.floating))
    assert isinstance(K_min, (int, np.integer))
    if gene_set_ids is not None:
        assert isinstance(gene_set_ids, Iterable)

    gene_set_coll = self._gene_set_coll
    gene_sets = self._gene_set_coll.gene_sets
    gene_memberships = self._gene_memberships
    sorted_genes = sorted(genes)

    # test only some terms?
    if gene_set_ids is not None:
        # materialize once: the ids are walked twice below
        gene_set_ids = list(gene_set_ids)
        gs_indices = np.int64([gene_set_coll.index(id_)
                               for id_ in gene_set_ids])
        gene_sets = [gene_set_coll[id_] for id_ in gene_set_ids]
        gene_memberships = gene_memberships[:, gs_indices]  # not a view!

    # determine K's
    K_vec = np.sum(gene_memberships, axis=0, dtype=np.int64)

    # exclude terms with too few genes
    sel = np.nonzero(K_vec >= K_min)[0]
    K_vec = K_vec[sel]
    gene_sets = [gene_sets[j] for j in sel]
    gene_memberships = gene_memberships[:, sel]

    # determine k's, ignoring unknown genes
    unknown = 0
    known_indices = []
    filtered_genes = []
    logger.debug('Looking up indices for %d genes...', len(sorted_genes))
    for g in sorted_genes:
        assert isinstance(g, (str, _oldstr))
        try:
            idx = self._genome.index(g)
        except ValueError:
            unknown += 1
        else:
            known_indices.append(idx)
            filtered_genes.append(g)

    gene_indices = np.int64(known_indices)
    k_vec = np.sum(gene_memberships[gene_indices, :], axis=0,
                   dtype=np.int64)
    if unknown > 0:
        logger.warning('%d / %d unknown genes (%.1f %%), will be ignored.',
                       unknown, len(genes),
                       100 * (unknown / float(len(genes))))

    # determine n and N
    n = len(filtered_genes)
    N, m = gene_memberships.shape
    logger.info('Conducting %d tests.', m)

    # correct p-value threshold, if specified (nothing to correct, and no
    # division by zero, when no gene set is left to test)
    final_pval_thresh = pval_thresh
    if adjust_pval_thresh and m > 0:
        final_pval_thresh /= float(m)
        logger.info('Using Bonferroni-corrected p-value threshold: %.1e',
                    final_pval_thresh)

    # calculate p-values and get significantly enriched gene sets
    enriched = []
    logger.debug('N=%d, n=%d', N, n)
    sys.stdout.flush()
    for j in range(m):
        pval = hypergeom.sf(k_vec[j] - 1, N, K_vec[j], n)
        if pval <= final_pval_thresh:
            # found significant enrichment
            # np.nonzero yields positions within gene_indices; map them back
            # to genome indices before the lookup (the positions used to be
            # passed to the genome directly, selecting unrelated genes).
            member_pos = np.nonzero(gene_memberships[gene_indices, j])[0]
            sel_genes = [self._genome[gene_indices[i]] for i in member_pos]
            enriched.append(
                StaticGSEResult(gene_sets[j], N, n, set(sel_genes), pval))

    return enriched
def enrichment(self, features_of_interest, background=None,
               p_value_cutoff=1000000, cross_reference=None,
               min_feature_size=3, min_background_size=5, domain=None):
    """Bonferroni-corrected hypergeometric p-values of GO enrichment

    Calculates hypergeometric enrichment of the features of interest, in
    each GO category.

    Parameters
    ----------
    features_of_interest : list-like
        List of features. Must match the identifiers in the ontology
        database exactly, i.e. if your ontology database is ENSEMBL ids,
        then you can only provide those and not common names like "RBFOX2"
    background : list-like, optional
        Background genes to use. It is best to use a relevant background
        such as all expressed genes. If None, defaults to all genes.
    p_value_cutoff : float, optional
        Maximum accepted Bonferroni-corrected p-value
    cross_reference : dict-like, optional
        A mapping of gene ids to gene symbols, e.g. a pandas Series of
        ENSEMBL genes e.g. ENSG00000139675 to gene symbols e.g HNRNPA1L2
    min_feature_size : int, optional
        Minimum number of features of interest overlapping in a GO Term,
        to calculate enrichment
    min_background_size : int, optional
        Minimum number of features in the background overlapping a GO Term
    domain : str or list, optional
        Only calculate GO enrichment for a particular GO category or
        subset of categories. Valid domains are those in ``self.domains``.

    Returns
    -------
    enrichment_df : pandas.DataFrame
        A (n_go_categories, columns) DataFrame of the enrichment scores,
        sorted by p-value. ``None`` (with a warning) when no GO term passes
        the size filters.

    Raises
    ------
    ValueError
        If features of interest and background do not overlap, or invalid
        GO domains are given

    Notes
    -----
    The raw p-value is ``hypergeom.sf(k, ...)`` with ``k`` the number of
    features found in the term.
    NOTE(review): that is P(X > k); confirm whether P(X >= k) was intended
    before changing it, since the unit tests rebuild the same value.
    """
    cross_reference = {} if cross_reference is None else cross_reference
    background = self.all_genes if background is None else background
    if len(set(background) & set(features_of_interest)) == 0:
        raise ValueError('Features of interest and background do not '
                         'overlap! Not calculating GO enrichment')
    if len(set(features_of_interest) & set(self.all_genes)) == 0:
        # the space after "term" was missing between the joined literals
        raise ValueError('Features of interest do not overlap with GO term '
                         'gene ids. Not calculating GO enrichment.')

    domains = self.domains
    valid_domains = ",".join("'{}'".format(x) for x in self.domains)

    if isinstance(domain, str):
        if domain not in self.domains:
            raise ValueError(
                "'{}' is not a valid GO domain. "
                "Only {} are acceptable".format(domain, valid_domains))
        domains = frozenset([domain])
    elif isinstance(domain, Iterable):
        if len(set(domain) & self.domains) == 0:
            raise ValueError(
                "'{}' are not a valid GO domains. "
                "Only {} are acceptable".format(
                    ",".join("'{}'".format(x) for x in domain),
                    valid_domains))
        domains = frozenset(domain)

    n_all_genes = len(background)
    n_features_of_interest = len(features_of_interest)
    enrichment = defaultdict(dict)

    for go_term, go_genes in self.ontology.items():
        if go_genes['domain'] not in domains:
            continue

        features_in_go = go_genes['genes'].intersection(
            features_of_interest)
        background_in_go = go_genes['genes'].intersection(background)
        too_few_features = len(features_in_go) < min_feature_size
        too_few_background = len(background_in_go) < min_background_size
        if too_few_features or too_few_background:
            continue

        # Survival function is more accurate on small p-values
        p_value = hypergeom.sf(len(features_in_go), n_all_genes,
                               len(background_in_go),
                               n_features_of_interest)
        p_value = 0 if p_value < 0 else p_value
        symbols = [cross_reference[f] if f in cross_reference else f
                   for f in features_in_go]
        enrichment['p_value'][go_term] = p_value
        enrichment['n_features_of_interest_in_go_term'][go_term] = len(
            features_in_go)
        enrichment['n_background_in_go_term'][go_term] = len(
            background_in_go)
        enrichment['n_features_total_in_go_term'][go_term] = len(
            go_genes['genes'])
        enrichment['features_of_interest_in_go_term'][
            go_term] = ','.join(features_in_go)
        enrichment['features_of_interest_in_go_term_gene_symbols'][
            go_term] = ','.join(symbols)
        enrichment['go_domain'][go_term] = go_genes['domain']
        enrichment['go_name'][go_term] = go_genes['name']
    enrichment_df = pd.DataFrame(enrichment)

    if enrichment_df.empty:
        warnings.warn('No GO categories enriched in provided features')
        return

    # Bonferonni correction
    enrichment_df['bonferonni_corrected_p_value'] = \
        enrichment_df.p_value * enrichment_df.shape[0]
    ind = enrichment_df['bonferonni_corrected_p_value'] < p_value_cutoff
    # .loc replaces the removed .ix indexer; a boolean mask behaves the same
    enrichment_df = enrichment_df.loc[ind]
    # DataFrame.sort(columns=...) no longer exists in current pandas; keep a
    # fallback so very old pandas versions still work.
    if hasattr(enrichment_df, 'sort_values'):
        enrichment_df = enrichment_df.sort_values(by=['p_value'])
    else:
        enrichment_df = enrichment_df.sort(columns=['p_value'])

    return enrichment_df
def _collect_peak_genes(ref_gtf, peaks_gff):
    """Return the set of gene ids whose GTF entries overlap a peak.

    Runs ``intersectBed -s -u -a <ref_gtf> -b <peaks_gff>`` and reads the
    ``gene_id`` attribute out of the ninth tab-separated column of every
    line it prints.

    Raises:
        RuntimeError: if intersectBed exits with a non-zero status, since
            an empty or truncated gene set would silently skew every
            downstream statistic.
    """
    # Argument list (no shell) so paths containing spaces or shell
    # metacharacters are passed through verbatim.
    cmd = ['intersectBed', '-s', '-u', '-a', ref_gtf, '-b', peaks_gff]
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            universal_newlines=True)

    peak_genes = set()
    for line in proc.stdout:
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])

    # reap the child and pick up its exit status
    proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError('intersectBed exited with status %d'
                           % proc.returncode)

    return peak_genes


def _compare_peaks_to_diff(options, peaks_gff, diff_file):
    """Compare CLIP peak-bound genes against RIP statistics.

    Writes ``<output_pre>_stats.txt`` (Mann-Whitney U comparison of the RIP
    values of bound vs. unbound genes), ``<output_pre>_hyper.txt``
    (hypergeometric overlap test) and, when both exclusive sets are
    non-empty, ``<output_pre>_venn.pdf``. Also hands the per-gene table to
    the ggplot script named in ``options.ggplot_script``.
    """
    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = _collect_peak_genes(options.ref_gtf, peaks_gff)

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(
            diff_file, just_ok=True, use_fold=False,
            max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(
            diff_file, use_fold=True, max_stat=options.max_stat,
            one_rbp=True)
        # NOTE(review): the fold values are re-hashed with FPKM / fold
        # limits only in fold-change mode; doing it unconditionally would
        # make the -t option a no-op for rip_fold. Confirm against history.
        rip_fold = ripseq.hash_rip_fold(
            diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
    for gene_id in rip_fold:
        # an empty silent_genes excludes nothing, so no length check needed
        if gene_id in gtf_genes and gene_id not in silent_genes:
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict,
                [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    rip_and_clip = list(zip(df_dict['RIP'], df_dict['CLIP']))
    bound_fold = [fold for fold, clip in rip_and_clip if clip == 'Bound']
    unbound_fold = [fold for fold, clip in rip_and_clip if clip == 'Unbound']

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    cols = (options.rbp,
            len(bound_fold), stats.mean(bound_fold),
            len(unbound_fold), stats.mean(unbound_fold),
            z, p)
    with open('%s_stats.txt' % options.output_pre, 'w') as stats_out:
        stats_out.write('%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e\n' % cols)

    ##################################################
    # plot venn diagram
    ##################################################
    rip_genes = set(gene_id for gene_id in df_dict['Gene']
                    if rip_bound.get(gene_id, False))

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        sys.stderr.write('Ignoring silent genes for hypergeometric test\n')

    # hypergeom.sf(x, M, n, N, loc=0) in textbook notation:
    #   x = k (observed overlap), M = N (population size),
    #   n = K (successes in population), N = n (number of draws).
    # sf(both-1, ...) gives P(overlap >= both).
    p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes),
                      len(rip_genes))
    p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes),
                      len(peak_genes))

    cols = (p1, p2, both, clip_only, rip_only,
            len(peak_genes), len(rip_genes), len(gtf_genes))
    with open('%s_hyper.txt' % options.output_pre, 'w') as hyper_out:
        hyper_out.write('%7.2e %7.2e %5d %5d %5d %5d %5d %5d\n' % cols)

    if clip_only > 0 and rip_only > 0:
        plt.figure()
        venn2(subsets=(clip_only, rip_only, both),
              set_labels=['CLIP', 'fRIP'],
              set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)
        # release the figure so repeated calls do not accumulate figures
        plt.close()


def main():
    """Command-line entry point: compare CLIP peaks with a cuffdiff .diff.

    Positional arguments are a peaks GFF file and a cuffdiff ``.diff``
    file. See ``_compare_peaks_to_diff`` for the files produced. When
    ``-s`` is given the reference GTF is first filtered with
    ``filter_single`` and the filtered file is removed again on exit, even
    if the analysis raises.

    NOTE: the ``-g`` and ``--ggplot`` defaults read ``os.environ['GENCODE']``
    and ``os.environ['RDIR']`` eagerly, so both variables must be set even
    when the options are supplied explicitly.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file',
                      help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf',
                      default='%s/gencode.v18.annotation.gtf'
                      % os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script',
                      default='%s/peaks_diff_compare.r' % os.environ['RDIR'],
                      help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float',
                      help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='',
                      help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP',
                      help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False,
                      action='store_true',
                      help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False,
                      action='store_true',
                      help='Use test statistic rather than fold change '
                           '[Default: %default]')
    parser.add_option('--sample1', dest='sample1',
                      help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2',
                      help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits
        parser.error('Must provide peaks GFF and .diff file')
    peaks_gff, diff_file = args

    ##################################################
    # process GTF
    ##################################################
    single_gtf_fd = None
    single_gtf_file = None
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    try:
        _compare_peaks_to_diff(options, peaks_gff, diff_file)
    finally:
        ##################################################
        # clean
        ##################################################
        # always drop the filtered GTF, including on failure
        if options.single_gene_loci:
            os.close(single_gtf_fd)
            os.remove(single_gtf_file)