def significance_assessment(self, cscPairA, cscPairD, leftregion, rightregion,
                            meta_chrome, arm, AmpPat, DelPat, chrm_genebkt):
    """Annotate CSC pairs with permutation p-values and BH-corrected p-values.

    For ``self.num_permutation`` rounds the amplification/deletion regions of
    ``meta_chrome``/``arm`` are permuted with
    ``cna_utils.cycle_shift_permutation``, re-formatted with
    ``cna_utils.formatEdgeId`` and passed to ``self.RAIG_algo``.  The best
    score of each round (``2 * min(lcount, rcount)``, or 0 when the round
    found nothing) forms the null distribution.  Every entry of ``cscPairA``
    and ``cscPairD`` then receives, in place:

    * ``'p-val'`` -- fraction of null scores strictly greater than its score;
    * ``'corrected-p-val'`` -- the ``'fdr_bh'`` adjusted value.

    Nothing is done (and ``self`` is not touched) when both dictionaries are
    empty.  Returns ``None``.
    """

    def pair_score(entry):
        # Score of one pair: twice the smaller of its two side counts.
        return 2 * min(entry['lcount'], entry['rcount'])

    def best_score(pairs):
        # Largest score found in one permutation round, 0 if the round is empty.
        if not pairs.keys():
            return 0
        return max([pair_score(pairs[cid]) for cid in pairs.keys()])

    def annotate(pairs, null_scores):
        # Write raw and corrected p-values into every entry of ``pairs``.
        if not pairs.keys():
            return
        ids, raw = [], []
        for cid in pairs.keys():
            observed = pair_score(pairs[cid])
            exceeding = sum(1 for s in null_scores if s > observed)
            pval = float(exceeding) / self.num_permutation
            pairs[cid]['p-val'] = pval
            raw.append(pval)
            ids.append(cid)
        adjusted = smm.multipletests(raw, alpha=0.05, method='fdr_bh')[1]
        for cid, adj in zip(ids, adjusted):
            pairs[cid]['corrected-p-val'] = adj

    if not cscPairA.keys() and not cscPairD.keys():
        return

    null_a, null_d = [], []
    for _ in range(0, self.num_permutation):
        permute_regionA, permute_regionD = cna_utils.cycle_shift_permutation(
            self.dlcall.regionA[meta_chrome][arm],
            self.dlcall.regionD[meta_chrome][arm],
            leftregion, rightregion)
        formatted = cna_utils.formatEdgeId(
            AmpPat.union(DelPat), permute_regionA, permute_regionD)
        pedgesetA, pedgesetD, pedgetoPatient, pedgewA, pedgewD, pposA, pposD = formatted
        pcscPairA, pcscPairD = self.RAIG_algo(
            pedgesetA, pedgesetD, pedgetoPatient, pedgewA, pedgewD,
            pposA, pposD, chrm_genebkt, len(AmpPat), len(DelPat))
        null_a.append(best_score(pcscPairA))
        null_d.append(best_score(pcscPairD))

    annotate(cscPairA, null_a)
    annotate(cscPairD, null_d)
def test_issorted(method):
    """Check that ``multipletests`` gives the same answer with ``is_sorted``.

    The p-values are corrected once in their original order
    (``is_sorted=False``) and once pre-sorted (``is_sorted=True``); after
    mapping the sorted results back to the original order both the reject
    flags and the corrected p-values must agree.  The fdrcorrection
    functions are tested indirectly.

    Parameters
    ----------
    method : str
        Correction method name forwarded to ``multipletests``.
    """
    # data generated as random numbers np.random.beta(0.2, 0.5, size=10)
    pvals = np.array([31, 9958111, 7430818, 8653643, 9892855, 876, 2651691,
                      145836, 9931, 6174747]) * 1e-7
    sortind = np.argsort(pvals)
    # argsort of the sort order maps sorted positions back to the originals
    sortrevind = sortind.argsort()
    pvals_sorted = pvals[sortind]

    res1 = multipletests(pvals, method=method, is_sorted=False)
    res2 = multipletests(pvals_sorted, method=method, is_sorted=True)

    # element 0: boolean reject decisions
    assert_equal(res2[0][sortrevind], res1[0])
    # element 1: corrected p-values (previously the reject flags were
    # compared a second time here, leaving the p-values untested)
    assert_allclose(res2[1][sortrevind], res1[1], rtol=1e-10)
def get_p_values(dat):
    """Compare the 'Before' and 'After' regions feature by feature.

    For every name in the module-level ``feat_avg_names`` the non-missing
    values of the rows with ``region == 'Before'`` and ``region == 'After'``
    are compared with ``ttest_ind``.  Features with no data on either side
    get NaN and are dropped.

    Parameters
    ----------
    dat : pandas.DataFrame
        Must contain a ``'region'`` column plus one column per feature name.

    Returns
    -------
    p_values : pandas.Series
        Raw p-values indexed by feature name, sorted ascending.
    pvals_corrected : pandas.Series
        ``'fdr_tsbky'`` corrected p-values with the same index; an empty
        float Series when no feature could be tested.
    """
    before = dat[dat['region'] == 'Before']
    after = dat[dat['region'] == 'After']

    feats, raw = [], []
    for feat in feat_avg_names:
        x = before[feat].dropna()
        y = after[feat].dropna()
        if x.size > 0 and y.size > 0:
            _, p = ttest_ind(x, y)
        else:
            p = np.nan
        feats.append(feat)
        raw.append(p)

    # Building from two lists (instead of zip(*pairs)) keeps an empty feature
    # list from raising; the explicit dtype keeps the empty case numeric.
    p_values = pd.Series(raw, index=feats, dtype=float).dropna()
    p_values = p_values.sort_values(ascending=True)

    if p_values.size > 0:
        corrected = smm.multipletests(p_values.values, method='fdr_tsbky')[1]
        pvals_corrected = pd.Series(corrected, index=p_values.index)
    else:
        pvals_corrected = pd.Series(dtype=float)

    return p_values, pvals_corrected
def multi_correct(data, meth='fdr_bh'):
    """
    Run a multiple-testing correction on an array of p values.

    Parameters:
    -----------
        data : numpy array
            p values, handed unchanged to
            ``smm.multipletests`` (called with ``alpha=0.05``)
        meth : str
            Method of correction, forwarded as ``method``. Options are:
                `bonferroni` : one-step correction
                `sidak` : one-step correction
                `holm-sidak` :
                `holm` :
                `simes-hochberg` :
                `hommel` :
                `fdr_bh` : Benjamini/Hochberg (default)
                `fdr_by` : Benjamini/Yekutieli

    Returns:
    ----------
        corrp : numpy array
            corrected p values (second element of the ``multipletests``
            result)
    """
    outcome = smm.multipletests(data, alpha=0.05, method=meth)
    return outcome[1]
def test_hommel():
    """Compare the Hommel correction with reference values from R.

    Reference: R stats ``p_adjust(pval0, method='hommel')``.  Both the
    corrected p-values and the reject decisions at ``alpha=0.1`` are checked.
    """
    raw_pvalues = np.array([
        0.00116, 0.00924, 0.01075, 0.01437, 0.01784,
        0.01918, 0.02751, 0.02871, 0.03054, 0.03246,
        0.04259, 0.06879, 0.0691, 0.08081, 0.08593,
        0.08993, 0.09386, 0.09412, 0.09718, 0.09758,
        0.09781, 0.09788, 0.13282, 0.20191, 0.21757,
        0.24031, 0.26061, 0.26762, 0.29474, 0.32901,
        0.41386, 0.51479, 0.52461, 0.53389, 0.56276,
        0.62967, 0.72178, 0.73403, 0.87182, 0.95384,
    ])
    expected = np.array([
        0.0464, 0.25872, 0.29025, 0.3495714285714286,
        0.41032, 0.44114, 0.57771, 0.60291,
        0.618954, 0.6492, 0.7402725000000001, 0.86749,
        0.86749, 0.8889100000000001, 0.8971477777777778, 0.8993,
        0.9175374999999999, 0.9175374999999999, 0.9175374999999999,
        0.9175374999999999, 0.9175374999999999, 0.9175374999999999,
        0.95384,
        0.9538400000000001, 0.9538400000000001, 0.9538400000000001,
        0.9538400000000001, 0.9538400000000001, 0.9538400000000001,
        0.9538400000000001, 0.9538400000000001, 0.9538400000000001,
        0.9538400000000001, 0.9538400000000001, 0.9538400000000001,
        0.9538400000000001, 0.9538400000000001, 0.9538400000000001,
        0.9538400000000001, 0.9538400000000001,
    ])

    outcome = multipletests(raw_pvalues, alpha=0.1, method='ho')
    rejected = outcome[0]
    corrected = outcome[1]

    assert_almost_equal(corrected, expected, 15)
    assert_equal(rejected, expected < 0.1)  # booleans
def DEGI(gctfile, clsfile, number):
    """Report differentially expressed genes using a label-permutation test.

    Reads a tab-delimited GCT expression file and a CLS class-label file.
    For each gene the absolute difference between the class "0" and class
    "1" means is compared against ``number`` random permutations of the
    labels.  P-values are Benjamini-Hochberg adjusted (alpha=0.05) and every
    significant gene is printed.

    The p-value is the fraction of permuted differences that are at least as
    large as the observed one.  (The previous code counted the permutations
    that were *smaller*, which made strongly differential genes look
    non-significant and vice versa.)

    Parameters
    ----------
    gctfile : str
        Path to the GCT file; the first two lines are skipped, then the
        first remaining row and the first two columns are treated as
        headers/annotation.
    clsfile : str
        Path to the CLS file; the third line holds whitespace-separated
        class labels ("0" / "1").
    number : int
        Number of label permutations per gene.

    Raises
    ------
    ValueError
        If either class has no samples.
    """
    # open and save input files
    with open(gctfile) as gct_handle:
        gct = numpy.genfromtxt(gct_handle, dtype=None, delimiter="\t",
                               missing_values="NA", invalid_raise=False,
                               skip_header=2)
    gct_exp = gct[1:, 2:].astype(float)  # matrix of expression values
    gct_genes = gct[1:, 1]  # list of gene names

    with open(clsfile) as cls_handle:
        label = numpy.array(cls_handle.read().splitlines()[2].split())

    # As before, only the first len(label) columns take part in the test.
    gct_exp = gct_exp[:, :label.size]

    in_class0 = label == "0"
    in_class1 = label == "1"
    if not in_class0.any() or not in_class1.any():
        raise ValueError('both class "0" and class "1" need at least one sample')

    def mean_gap(values, labels):
        # Absolute difference between the two class means for one gene.
        return abs(values[labels == "0"].mean() - values[labels == "1"].mean())

    pvals = []
    for i in range(len(gct_genes)):
        row = gct_exp[i]
        observed = mean_gap(row, label)
        at_least_as_extreme = 0.
        for _ in range(number):
            shuffled = numpy.random.permutation(label)
            if mean_gap(row, shuffled) >= observed:
                at_least_as_extreme += 1.
        pvals.append(at_least_as_extreme / number)

    # correct for multiple hypothesis tests using benjamini-hochberg
    bh = smm.multipletests(pvals, alpha=0.05, method='fdr_bh')
    bh_sig = bh[0]
    bh_pvals = bh[1].astype(str)

    sig = 0
    for i in range(len(bh_sig)):
        if bh_sig[i]:
            print(gct_genes[i] + " is differentially expressed.\nThe adjusted p-value is " + bh_pvals[i] + "\n")
            sig += 1
    if sig == 0:
        print("There are no differentially expressed genes.")
def pval_corrected(self, method=None):
    '''Return ``self.pvals_raw`` corrected for multiple testing.

    When ``method`` is None the instance default stored in
    ``self.multitest_method`` is used; otherwise ``method`` is passed to
    ``multipletests`` as given.  Only the corrected p-values (element 1 of
    the result) are returned.
    '''
    from statsmodels.stats import multitest as smt

    chosen = self.multitest_method if method is None else method
    # TODO: breaks with method=None
    corrected = smt.multipletests(self.pvals_raw, method=chosen)
    return corrected[1]
def test_pvalcorrection_reject(alpha, method, ii):
    """Consistency check: reject flags must equal ``pvalscorr <= alpha``.

    The ten input p-values consist of ``ii`` small values followed by
    ``10 - ii`` larger ones.
    """
    small = np.linspace(0.0001, 0.0100, ii)
    # using .05001 instead of 0.05 to avoid edge case issue #768
    large = np.linspace(0.05001, 0.11, 10 - ii)
    pval1 = np.hstack((small, large))

    outcome = multipletests(pval1, alpha=alpha, method=method)
    reject = outcome[0]
    pvalscorr = outcome[1]

    details = (method, alpha, reject.sum(), pval1, pvalscorr)
    msg = 'case %s %3.2f rejected:%d\npval_raw=%r\npvalscorr=%r' % details
    assert_equal(reject, pvalscorr <= alpha, err_msg=msg)
def test_multi_pvalcorrection_rmethods(self, key, val):
    """Compare one correction method with R's multtest ``mt.rawp2adjp``.

    ``val`` is a ``(column, method)`` pair: ``column`` selects the reference
    column of ``self.res2`` and ``method`` is the ``multipletests`` method
    name.  Methods not listed in ``self.methods`` are skipped.  ``key`` is
    not used by the body.
    """
    res_multtest = self.res2
    pval0 = res_multtest[:, 0]
    if val[1] not in self.methods:
        return

    outcome = multipletests(pval0, alpha=self.alpha, method=val[1])
    reject = outcome[0]
    pvalscorr = outcome[1]
    assert_almost_equal(pvalscorr, res_multtest[:, val[0]], 15)
    assert_equal(reject, pvalscorr <= self.alpha)
def get_score_df(self, correction_method=None):
    '''
    Build a per-term table of Cohen's d and Hedges' r statistics.

    :param correction_method: str or None, correction method from
        statsmodels.stats.multitest.multipletests; 'fdr_bh' is recommended.
        When given, ``cohens_d_p_corr`` and ``hedges_r_p_corr`` columns are
        added (0.5 for terms whose ``m1`` and ``m2`` are both zero).
    :return: pd.DataFrame indexed by ``self.corpus_.get_terms()``
    '''
    # From https://people.kth.se/~lang/Effect_size.pdf
    # Shinichi Nakagawa1 and Innes C. Cuthill. 2007. In Biological Reviews 82.
    X = self._get_X().astype(np.float64)
    X = X / X.sum(axis=1)  # each row scaled by its own sum
    cat_X, ncat_X = self._get_cat_and_ncat(X)

    # NOTE(review): the sizes are taken from shape[1] while the means and
    # variances below reduce over axis 0 -- confirm this is intended.
    n1 = float(cat_X.shape[1])
    n2 = float(ncat_X.shape[1])
    n = n1 + n2

    m1, m2 = cat_X.mean(axis=0).A1, ncat_X.mean(axis=0).A1
    v1, v2 = cat_X.var(axis=0).A1, ncat_X.var(axis=0).A1

    s_pooled = np.sqrt(((n2 - 1) * v2 + (n1 - 1) * v1) / (n - 2.))
    cohens_d = (m1 - m2) / s_pooled
    cohens_d_se = np.sqrt(((n - 1.) / (n - 3)) * (4. / n) * (1 + np.square(cohens_d)))
    cohens_d_z = cohens_d / cohens_d_se
    cohens_d_p = norm.sf(cohens_d_z)

    hedges_r = cohens_d * (1 - 3. / ((4. * (n - 2)) - 1))
    hedges_r_se = np.sqrt(n / (n1 * n2) + np.square(hedges_r) / (n - 2.))
    hedges_r_z = hedges_r / hedges_r_se
    hedges_r_p = norm.sf(hedges_r_z)

    columns = {
        'cohens_d': cohens_d,
        'cohens_d_se': cohens_d_se,
        'cohens_d_z': cohens_d_z,
        'cohens_d_p': cohens_d_p,
        'hedges_r': hedges_r,
        'hedges_r_se': hedges_r_se,
        'hedges_r_z': hedges_r_z,
        'hedges_r_p': hedges_r_p,
        'm1': m1,
        'm2': m2,
    }
    score_df = pd.DataFrame(columns, index=self.corpus_.get_terms()).fillna(0)

    if correction_method is None:
        return score_df

    from statsmodels.stats.multitest import multipletests

    # Created first so the column order matches earlier output.
    score_df['hedges_r_p_corr'] = 0.5
    # Terms whose mean is zero in both groups keep the 0.5 placeholder.
    seen = (score_df['m1'] != 0) | (score_df['m2'] != 0)
    for stat in ['cohens_d', 'hedges_r']:
        target = stat + '_p_corr'
        score_df[target] = 0.5
        score_df.loc[seen, target] = multipletests(
            score_df.loc[seen, stat + '_p'], method=correction_method)[1]
    return score_df
def is_from_null(self, alpha, samples, chane_prob):
    """Test every column of ``samples`` and Holm-correct the p-values.

    For each dimension a statistic is obtained from
    ``self.tester.get_statistic_multiple_dim`` and turned into a p-value by
    ``self.tester.compute_pvalues_for_processes`` using
    ``10 * int(dims / alpha)`` bootstrap draws.  The raw p-values are
    printed.

    Returns a pair ``(any_rejected, pvals_corrected)`` where
    ``any_rejected`` is true when at least one dimension is rejected at
    level ``alpha``.
    """
    n_dims = samples.shape[1]
    n_boots = 10 * int(n_dims / alpha)

    pvals = np.zeros(n_dims)
    for dim in range(n_dims):
        statistic = self.tester.get_statistic_multiple_dim(samples, dim)[0]
        pvals[dim] = self.tester.compute_pvalues_for_processes(
            statistic, chane_prob, n_boots)
    print(pvals)

    outcome = multipletests(pvals, alpha, method='holm')
    alt_is_true = outcome[0]
    pvals_corrected = outcome[1]
    return any(alt_is_true), pvals_corrected
def test_pvalcorrection_reject():
    """Consistency check of reject flags against corrected p-values.

    Runs every combination of three alpha levels, eleven method names and
    0..10 small p-values, asserting ``reject == (pvalscorr <= alpha)``.
    """
    alphas = [0.01, 0.05, 0.1]
    methods = ['b', 's', 'sh', 'hs', 'h', 'hommel', 'fdr_i', 'fdr_n',
               'fdr_tsbky', 'fdr_tsbh', 'fdr_gbs']

    for alpha in alphas:
        for method in methods:
            for ii in range(11):
                small = np.linspace(0.0001, 0.0100, ii)
                # using .05001 instead of 0.05 to avoid edge case issue #768
                large = np.linspace(0.05001, 0.11, 10 - ii)
                pval1 = np.hstack((small, large))

                outcome = multipletests(pval1, alpha=alpha, method=method)
                reject = outcome[0]
                pvalscorr = outcome[1]

                details = (method, alpha, reject.sum(), pval1, pvalscorr)
                msg = 'case %s %3.2f rejected:%d\npval_raw=%r\npvalscorr=%r' % details
                assert_equal(reject, pvalscorr <= alpha, err_msg=msg)
def correct_enrichment_pvalues(enrichments, method, sig_cutoff):
    """Correct enrichment p-values and keep only the significant gene sets.

    Parameters
    ----------
    enrichments : iterable of dict
        Each dict maps a gene-set name to its raw p-value.
    method : str or None
        ``multipletests`` method name.  ``'none'`` or ``None`` skips the
        correction and thresholds the raw p-values.
    sig_cutoff : float
        Significance level; a gene set is kept when its (corrected) p-value
        is ``<= sig_cutoff``, matching how ``multipletests`` decides.

    Returns
    -------
    list of dict
        One dict per input, mapping the kept gene-set names to their
        (corrected) p-values.  An empty input dict yields an empty dict.
    """
    corrected_enrichments = []
    for enrichment in enrichments:
        if not enrichment:
            # nothing to correct; avoid calling multipletests on no data
            corrected_enrichments.append({})
            continue

        # Materialise keys and values once so they can be indexed (dict views
        # cannot) and compared element-wise.
        gene_set_names = list(enrichment.keys())
        pvalues = np.asarray(list(enrichment.values()), dtype=float)

        if method == 'none' or method is None:
            corrected_pvalues = pvalues
            # keep the significant ones (the old test was inverted and
            # compared a whole list against the scalar cutoff)
            reject = pvalues <= sig_cutoff
        else:
            reject, corrected_pvalues, _, _ = smm.multipletests(
                pvalues, alpha=sig_cutoff, method=method)

        accepted_indices = np.where(reject)[0]
        accepted_pvalues = {gene_set_names[i]: corrected_pvalues[i]
                            for i in accepted_indices}
        corrected_enrichments.append(accepted_pvalues)
    return corrected_enrichments
def __call__(self, track):
    """Summarise how many bases of ``track`` are significant after FDR.

    Reads a tab-separated table (first line treated as a header and
    replaced by the column names ``contig``, ``start``, ``p``) from
    ``self.openFile(track)``, converts ``p`` to Benjamini-Hochberg q-values
    and counts the rows with q < 0.01.

    Returns a dict with the keys ``"Bases"`` (row count), ``"Significant"``
    and ``"Fraction_Significant"``.  An empty table returns zeros instead of
    raising ``ZeroDivisionError``.
    """
    print("Reading %s" % track)
    data = pandas.read_csv(self.openFile(track),
                           header=0,
                           names=["contig", "start", "p"],
                           sep="\t")
    print("Done")

    output = dict()
    output["Bases"] = data.shape[0]
    if output["Bases"] == 0:
        # no rows: nothing to correct and no fraction to compute
        output["Significant"] = 0
        output["Fraction_Significant"] = 0.0
        return output

    data["qvalues"] = multipletests(data["p"], method="fdr_bh")[1]
    output["Significant"] = (data["qvalues"] < 0.01).sum()
    output["Fraction_Significant"] = \
        float(output["Significant"]) / output["Bases"]
    return output
def test_multi_pvalcorrection():
    """Compare corrections with R package multtest ``mt.rawp2adjp``.

    Uses the module-level ``pval0`` and ``res_multtest``.  The
    ``multipletests`` output is re-indexed into R's row order before the
    comparison, so the sequence is checked as well; the direct
    ``fdrcorrection`` results are compared after sorting.
    """
    # R name -> (column in res_multtest, multipletests method name)
    rmethods = {'rawp': (0, 'pval'),
                'Bonferroni': (1, 'b'),
                'Holm': (2, 'h'),
                'Hochberg': (3, 'sh'),
                'SidakSS': (4, 's'),
                'SidakSD': (5, 'hs'),
                'BH': (6, 'fdr_i'),
                'BY': (7, 'fdr_n')}
    checked = ('b', 's', 'sh', 'hs', 'h', 'fdr_i', 'fdr_n')
    # row order used by the R reference table
    r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]

    for column, code in rmethods.values():
        if code not in checked:
            continue
        corrected = multipletests(pval0, alpha=0.1, method=code)[1]
        assert_almost_equal(corrected[r_sortindex], res_multtest[:, column], 15)

    by_sorted = np.sort(fdrcorrection(pval0, method='n')[1])
    assert_almost_equal(by_sorted, res_multtest[:, 7], 15)
    bh_sorted = np.sort(fdrcorrection(pval0, method='i')[1])
    assert_almost_equal(bh_sorted, res_multtest[:, 6], 15)
def test_multi_pvalcorrection(self):
    """Compare corrections with R package multtest ``mt.rawp2adjp``.

    For every entry of the module-level ``rmethods`` whose method name is in
    ``self.methods`` the corrected p-values must match the reference column
    of ``self.res2`` and the reject flags must equal
    ``pvalscorr <= self.alpha``.  The direct ``fdrcorrection`` results are
    compared after sorting.
    """
    res_multtest = self.res2
    pval0 = res_multtest[:, 0]

    for _, (column, code) in iteritems(rmethods):
        if code not in self.methods:
            continue
        outcome = multipletests(pval0, alpha=self.alpha, method=code)
        reject = outcome[0]
        pvalscorr = outcome[1]
        assert_almost_equal(pvalscorr, res_multtest[:, column], 15)
        assert_equal(reject, pvalscorr <= self.alpha)

    by_sorted = np.sort(fdrcorrection(pval0, method='n')[1])
    assert_almost_equal(by_sorted, res_multtest[:, 7], 15)
    bh_sorted = np.sort(fdrcorrection(pval0, method='i')[1])
    assert_almost_equal(bh_sorted, res_multtest[:, 6], 15)
def compute_q_values(contingencies, bonferroni_count=None):
    """Compute chi-square p-values and multiplicity-adjusted q-values.

    Parameters
    ----------
    contingencies : dict
        Maps ``(target, event)`` pairs to contingency tables accepted by
        ``stats.chi2_contingency``.
    bonferroni_count : number or None
        When truthy, q-values are ``p * bonferroni_count`` capped at 1.
        Otherwise the Holm correction from ``multipletests`` is applied.

    Returns
    -------
    target_event_pairs : list of tuple
    p_vals : list of float
    q_vals : list or numpy.ndarray
        A list for the Bonferroni path (and for empty input), the array
        returned by ``multipletests`` for the Holm path.
    """
    logging.info("Computing p and q-values")
    target_event_pairs = []
    p_vals = []
    for (target, event), table in contingencies.items():
        pvalue = stats.chi2_contingency(table)[1]
        target_event_pairs.append((target, event))
        p_vals.append(pvalue)

    # Calculate the adjusted p-values
    if bonferroni_count:
        logging.info("Using Bonferroni correction for q-value calculations")
        factor = float(bonferroni_count)
        # product first so a NaN p-value stays NaN instead of becoming 1.0
        q_vals = [min(pval * factor, 1.0) for pval in p_vals]
    else:
        logging.info("Using Holm correction for q-value calculations")
        if p_vals:
            q_vals = multipletests(p_vals, alpha=0.05, method='holm')[1]
        else:
            # nothing to correct
            q_vals = []
    return target_event_pairs, p_vals, q_vals
def test_associations(data, test_types=("two-sided",), threshold=None, corr_method="fdr_bh", associations=None):
    """Test column pairs for association and adjust the p-values.

    Every ``(a, b)`` pair from ``associations`` (all 2-combinations of
    ``data.columns`` when None) is passed to ``test_association`` once per
    entry of ``test_types``.  The p-values are adjusted with
    ``multipletests(method=corr_method)``.

    Returns a DataFrame with the columns ``a``, ``b``, ``test_type``,
    ``p_value`` and ``p_value_adj``, sorted by ``p_value_adj``; when
    ``threshold`` is given only rows with ``p_value_adj <= threshold`` are
    kept.
    """
    pairs = associations
    if pairs is None:
        pairs = itertools.combinations(data.columns, 2)

    rows = []
    for a, b in pairs:
        for test_type in test_types:
            p_value = test_association(data[[a, b]], test_type=test_type)
            rows.append((a, b, test_type, p_value))

    frame = pd.DataFrame(rows, columns=["a", "b", "test_type", "p_value"])
    adjusted = multipletests(frame["p_value"], method=corr_method)
    frame["p_value_adj"] = adjusted[1]
    frame = frame.sort_values(by="p_value_adj")

    if threshold is None:
        return frame
    return frame.query("p_value_adj <= {}".format(threshold))
def calc_kruskal(x, sample_num_l, alpha):
    """Kruskal-Wallis test for one row, with pairwise follow-up tests.

    Parameters
    ----------
    x : list
        One input row; element 0 is the id column, the rest is split into
        groups by ``split_list(x[1:], sample_num_l)``.
    sample_num_l : sequence of int
        Group sizes; its length is the number of groups.
    alpha : float
        Significance level for the Kruskal-Wallis test and for the
        Benjamini-Hochberg correction of the pairwise tests.

    Returns
    -------
    list
        ``x`` extended by two strings: the Kruskal-Wallis p-value and a flag
        that is ``'1'`` only when that p-value is below ``alpha`` and every
        pairwise Mann-Whitney comparison is rejected.  ``['1.00', '0']`` is
        appended when the test cannot be computed or yields NaN.
    """
    tmp_input_l = split_list(x[1:], sample_num_l)  # ignore id column
    try:
        _, p = stats.kruskal(*tmp_input_l)  # run kruskal-wallis test
    except ValueError:
        return x + ['1.00', '0']
    if math.isnan(p):
        return x + ['1.00', '0']

    flag = 0
    if p < alpha:
        num = len(sample_num_l)
        pval_l = []
        for i in range(num - 1):
            for j in range(i + 1, num):
                try:
                    tmp_p = stats.mannwhitneyu(tmp_input_l[i], tmp_input_l[j])[1]
                except ValueError:
                    tmp_p = 0.5
                # NOTE(review): the doubling assumes mannwhitneyu returns a
                # one-sided p-value -- confirm against the installed SciPy,
                # whose default alternative has changed between versions.
                pval_l.append(tmp_p * 2)
        rej = smm.multipletests(pval_l, alpha=alpha, method='fdr_bh')[0]  # fdr correction
        flag = 1 if all(rej) else 0

    # repr() replaces the Python-2-only backtick syntax; float() keeps a
    # numpy scalar from printing with its type name.
    return x + [repr(float(p)), repr(flag)]
def main(table_fpath, fdr=.1):
    """Print, per table row, which of the first two p-values are below .05.

    Lines of ``table_fpath`` containing ``#`` are skipped; of the rest, only
    lines with exactly five tab-separated fields contribute, and their last
    four fields are read as p-values.  The collected values are reshaped to
    four per row and the first two columns are compared against .05.
    """
    collected = []
    with open(table_fpath) as tables_file:
        for line in tables_file:
            if '#' in line:
                continue
            fields = line.split('\t')
            if len(fields) != 5:
                continue
            collected.extend(float(x) for x in fields[1:])

    pvalues = np.asarray(collected)
    reject = multitest.multipletests(pvalues, fdr, method='fdr_bh')[0]

    n = reject.shape[0]
    shape = (n // 4, 4)
    # NOTE(review): X (the FDR decisions) is never used; the loop below
    # thresholds the raw p-values -- confirm which of the two is intended.
    X = reject.reshape(shape)[:, 0:2]
    P = pvalues.reshape(shape)[:, 0:2]
    for row in P:
        print(row < .05)
def calcEnrichment(self, method='Fisher', correction='FDR'):
    """Test every background property for enrichment in the foreground.

    For each key ``k`` of ``self.bg_cid_prop_map`` a 2x2 contingency table
    is assembled (rows: in foreground F / not in F; columns: in the ligand
    set L = ``self.db_prop_cid_map[k]`` / not in L) and tested with Fisher's
    exact test.  The foreground is the key set of ``self.fg_score_dict``;
    everything else in ``self.background`` is "not in F".

    Parameters
    ----------
    method : str
        Must be listed in the supported methods; only ``'Fisher'`` is
        implemented.
    correction : str
        ``'FDR'`` (Benjamini-Hochberg) or ``'Bonferroni'`` (capped at 1).
        Any other value prints a message and exits with status 1; this is
        now checked before the tests are run.

    Returns
    -------
    tuple
        ``(props, odds_r, n_r, p_val, p_adj)`` -- parallel lists holding the
        property, odds ratio, ``"<in F & L>/<in L>"`` string, raw p-value
        and adjusted p-value.

    Raises
    ------
    ValueError
        If ``method`` is unsupported or not implemented.
    """
    if method not in self.__SUPP_METHODS:
        raise ValueError('\'%s\' is not a supported method' % method)
    if method != 'Fisher':
        raise ValueError('\'%s\' is not yet implemented' % method)
    if correction not in ('FDR', 'Bonferroni'):
        print('Unknown method for multiple hypothesis correction:')
        print(correction)
        print('Exiting')
        exit(1)

    p_val = list()
    odds_r = list()
    n_r = list()
    props = list()

    # test each property (k) independently for enrichment
    # (e.g. drug targets with ligand set L in foreground F)
    foreground = set(self.fg_score_dict)
    not_foreground = self.background.difference(foreground)
    for k in self.bg_cid_prop_map:
        ligands = self.db_prop_cid_map[k]
        ct_11 = len(foreground.intersection(ligands))      # in F & in L
        ct_12 = len(foreground.difference(ligands))        # in F & not in L
        ct_21 = len(not_foreground.intersection(ligands))  # not in F & in L
        ct_22 = len(not_foreground.difference(ligands))    # not in F & not in L
        table = [[ct_11, ct_12], [ct_21, ct_22]]
        o, p = stats.fisher_exact(table)
        props.append(k)
        odds_r.append(o)
        n_r.append(str(ct_11) + '/' + str(ct_11 + ct_21))
        p_val.append(p)

    # correct for multiple testing
    if not p_val:
        # no properties were tested, so there is nothing to adjust
        p_adj = []
    elif correction == 'FDR':
        p_adj = list(multitest.multipletests(p_val, method='fdr_bh')[1])
    else:
        n_tests = len(p_val)
        # adjusted p-values are probabilities, so cap them at 1
        p_adj = [min(p * n_tests, 1.0) for p in p_val]
    return (props, odds_r, n_r, p_val, p_adj)
def get_corrected_pvalues(self, pvalues, method=None):
    """Return corrected pvalues

    :param list pvalues: list or array of pvalues to correct.
    :param method: correction method; when given it replaces
        ``self.method`` (the assignment is kept on the instance), otherwise
        the current ``self.method`` is used.

    ``'qvalue'`` is handled by ``QValue``; every other method name is passed
    to ``multipletests`` together with ``self.alpha``.
    """
    if method is not None:
        self.method = method

    values = np.array(pvalues)
    if self.method == 'qvalue':
        return QValue(values).qvalue()

    outcome = multitest.multipletests(values, alpha=self.alpha,
                                      method=self.method)
    return outcome[1]
def getPValues(feat_mean, strain_list, feat_list):
    """Compare every strain with the 'N2' group, feature by feature.

    ``feat_mean`` is grouped by its ``'Strain'`` column.  For each strain in
    ``strain_list`` and each feature in ``feat_list`` a Welch t-test
    (``equal_var=False``) against the N2 values gives a p-value; within
    each strain the non-NaN p-values are then replaced by their
    ``'fdr_tsbky'`` (two-stage Benjamini-Krieger-Yekutieli) corrected
    values.

    Returns a DataFrame of corrected p-values with features as rows and
    strains as columns.
    """
    by_strain = feat_mean.groupby('Strain')
    reference = by_strain.get_group('N2')

    pvalue_table = pd.DataFrame(np.nan, index=feat_list, columns=strain_list,
                                dtype=np.float64)

    for strain in pvalue_table.columns.values:
        candidate = by_strain.get_group(strain)
        for feat in pvalue_table.index.values:
            outcome = ttest_ind(reference[feat].values, candidate[feat].values,
                                equal_var=False)
            pvalue_table.loc[feat, strain] = outcome[1]

        # correct for false discovery rate, ignoring features without a p-value
        good = ~np.isnan(pvalue_table[strain])
        corrected = smm.multipletests(pvalue_table.loc[good, strain].values,
                                      method='fdr_tsbky')
        pvalue_table.loc[good, strain] = corrected[1]

    return pvalue_table
def deg_stat(data, classes, pos, neg, adjust='fdr_bh'):
    '''
    Row-wise Welch t-test between two column groups of a normalized DataFrame.

    For RNA-seq data, processing with READemption is the better option.

    :param data: the pandas dataframe (left untouched; a copy is annotated)
    :param classes: the class vector used to group the columns
    :param pos: the positive class name
    :param neg: the negative class name
    :param adjust: the multipletests adjust method
    :return: a copy of ``data`` with the extra columns ``fold-change``
             (positive row mean minus negative row mean), ``p-value`` and
             ``fdr``.
    '''
    data = data.copy()
    positive = data.groupby(classes, axis=1).get_group(pos)
    negative = data.groupby(classes, axis=1).get_group(neg)

    pvalues = []
    for i in range(positive.shape[0]):
        outcome = ttest_ind(positive.iloc[i], negative.iloc[i], equal_var=False)
        pvalues.append(outcome[1])

    fold_change = positive.mean(axis=1) - negative.mean(axis=1)
    adjusted = multipletests(pvalues, method=adjust)[1]

    data['fold-change'] = pd.Series(fold_change, index=data.index)
    data['p-value'] = pd.Series(pvalues, index=data.index)
    data['fdr'] = pd.Series(adjusted, index=data.index)
    return data
def multi_correct(data, noi_idx, meth='fdr_bh'):
    """
    Run fdr correction on nodes of interest contained in an array of p values.

    Only the strict upper triangle of the sub-matrix selected by
    ``noi_idx`` is corrected; the result is mirrored to the lower triangle
    and the diagonal is zero.

    Parameters:
    -----------
        data : numpy array
            nnodes x nnodes array containing p values of correlation between each node
        noi_idx : numpy indices
            (applicable to both row and column) of nodes of interest. This
            reduces the number of nodes corrected for
        meth : str
            Method of correction. Options are:
                `bonferroni` : one-step correction
                `sidak` : one-step correction
                `holm-sidak` :
                `holm` :
                `simes-hochberg` :
                `hommel` :
                `fdr_bh` : Benjamini/Hochberg (default)
                `fdr_by` : Benjamini/Yekutieli

    Returns:
    ----------
        fdr_corrected : numpy array
            square array, one row/column per node of interest, containing
            the corrected p values
    """
    noi_data = data[np.ix_(noi_idx, noi_idx)]
    n_noi = noi_data.shape[0]
    upper_rows, upper_cols = np.triu_indices_from(noi_data, k=1)

    fdr_corr_array = np.zeros((n_noi, n_noi))
    if upper_rows.size == 0:
        # fewer than two nodes of interest: there are no pairs to correct
        return fdr_corr_array

    # Take the p values straight from their positions.  Filtering out zeros
    # (as was done before) dropped genuine p == 0 entries and shifted every
    # later corrected value onto the wrong matrix cell.
    upper_pvals = noi_data[upper_rows, upper_cols]
    corrp = smm.multipletests(upper_pvals, alpha=0.05, method=meth)[1]

    fdr_corr_array[upper_rows, upper_cols] = corrp
    return fdr_corr_array + fdr_corr_array.T
def BH_correct(data, indx, thresh):
    """Append Benjamini-Hochberg corrected p-values to the rows above a threshold.

    Parameters
    ----------
    data : iterable of sequences
        Rows whose first element is a name and whose last element is a
        p-value.
    indx : int
        Column compared against ``thresh``.
    thresh : number or str
        Rows with ``float(row[indx]) > float(thresh)`` are kept.

    Returns
    -------
    list
        Kept rows first -- numeric fields converted to float (as decided by
        ``is_number``), the corrected p-value appended, sorted by corrected
        then raw p-value -- followed by ``[name, "-10000"]`` for every
        excluded row.
    """
    cutoff = float(thresh)
    kept = []
    vals = []
    d_exclude = []
    for lines in data:
        if float(lines[indx]) > cutoff:
            kept.append([float(num) if is_number(num) else num for num in lines])
            vals.append(float(lines[-1]))
        else:
            d_exclude.append([lines[0], "-10000"])

    if kept:
        # skip the correction entirely when no row passed the threshold
        bhs = ssm.multipletests(vals, method="fdr_bh")[1]
        for datum, bh in zip(kept, bhs):
            datum.append(bh)
        kept.sort(key=itemgetter(-1, -2))

    return kept + d_exclude
def t_test_multi(result, contrasts, method='hs', alpha=0.05, ci_method=None, contrast_names=None):
    """perform t_test and add multiplicity correction to results dataframe

    Parameters
    ----------
    result : results instance
        results of an estimated model
    contrasts : ndarray
        restriction matrix for t_test
    method : string or list of strings
        method for multiple testing p-value correction, default is 'hs'.
    alpha : float
        significance level for multiple testing reject decision.
    ci_method : None
        not used yet, will be for multiplicity corrected confidence intervals
    contrast_names : list of strings or None
        If contrast_names are provided, then they are used in the index of
        the returned dataframe, otherwise some generic default names are
        created.

    Returns
    -------
    res_df : pandas DataFrame
        The dataframe contains the results of the t_test and, for every
        requested method, a ``pvalue-<method>`` column with the corrected
        p-values and a ``reject-<method>`` column with the boolean reject
        decision.
    """
    tt = result.t_test(contrasts)
    res_df = tt.summary_frame(xname=contrast_names)

    methods = method if type(method) is list else [method]
    for meth in methods:
        outcome = multipletests(tt.pvalue, method=meth, alpha=alpha)
        res_df['pvalue-%s' % meth] = outcome[1]
        res_df['reject-%s' % meth] = outcome[0]
    return res_df
def close(self):
    """Combine the collected p-values and write the requested outputs.

    The pieces in ``self.output`` are concatenated and sorted by index.
    When ``self.correct`` is set they are Benjamini-Hochberg corrected.
    If ``self.outfile_windows`` is set, windows built by
    ``bases_to_windows`` from the values below ``self.threshold`` are
    written for every gene; if ``self.outfile_bases`` is set, the minimum
    value per (contig, position) is written as tab-separated
    ``contig, position, position + 1, value`` rows.
    """
    output = pd.concat(self.output).sort_index()
    E.debug("most 3' coordingate seen is %s" % (output.index.values[-1],))

    if self.correct:
        E.info("Correcting p-values using BH ...")
        adjusted = multipletests(output, method="fdr_bh")[1]
        output = pd.Series(adjusted, index=output.index)

    E.info("Writing output")
    E.debug("output contains %i entries" % len(output))

    if self.outfile_windows:
        E.info("Writing windows")
        sig_windows = output[output < self.threshold]
        for gene in self.genes:
            for bed in bases_to_windows(sig_windows, gene,
                                        self.window_size, self.threshold):
                self.outfile_windows.write(str(bed) + "\n")

    if self.outfile_bases:
        E.info("Writing bases")
        bases = output.reset_index()
        bases = bases.drop(["strand", "gene_id"], axis=1)
        bases = bases.groupby(["contig", "position"], as_index=False).min()
        bases.position = bases.position.astype("int64")
        bases["end"] = bases["position"] + 1
        # column 0 holds the (possibly corrected) p-values
        bases = bases[["contig", "position", "end", 0]]
        bases.to_csv(self.outfile_bases, sep="\t", header=False, index=False)
def outlier_test(model_results, method='bonf', alpha=.05, labels=None, order=False, cutoff=None):
    """
    Outlier Tests for RegressionResults instances.

    Parameters
    ----------
    model_results : RegressionResults
        Linear model results; must provide ``get_influence``.
    method : str
        Multiple-testing correction, one of
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` :
        - `holm` :
        - `simes-hochberg` :
        - `hommel` :
        - `fdr_bh` : Benjamini/Hochberg
        - `fdr_by` : Benjamini/Yekutieli
        See `statsmodels.stats.multitest.multipletests` for details.
    alpha : float
        familywise error rate
    labels : None or array_like
        If `labels` is not None, then it will be used as index to the
        returned pandas DataFrame. When None, the model's ``row_labels``
        are used if available.
    order : bool
        Whether or not to order the results by the absolute value of the
        studentized residuals (largest first). Labels are reordered along
        with them.
    cutoff : None or float in [0, 1]
        If cutoff is not None, then the return only includes observations
        with multiple testing corrected p-values strictly below the cutoff.
        The returned array or dataframe can be empty if there are no
        outlier candidates at the specified cutoff.

    Returns
    -------
    table : ndarray or DataFrame
        A DataFrame when labels are available, otherwise an ndarray. The
        columns are the Studentized residuals, the unadjusted p-value, and
        the corrected p-value according to method.

    Raises
    ------
    AttributeError
        If ``model_results`` has no ``get_influence`` method.

    Notes
    -----
    The unadjusted p-value is 2 * stats.t.sf(abs(resid), df) where
    df = df_resid - 1.
    """
    from scipy import stats  # lazy import

    if labels is None:
        labels = getattr(model_results.model.data, 'row_labels', None)

    infl = getattr(model_results, 'get_influence', None)
    if infl is None:
        results = maybe_unwrap_results(model_results)
        raise AttributeError(
            "model_results object %s does not have a get_influence method."
            % results.__class__.__name__)
    resid = infl().resid_studentized_external

    if order:
        # largest absolute residual first
        idx = np.abs(resid).argsort()[::-1]
        resid = resid[idx]
        if labels is not None:
            labels = np.asarray(labels)[idx]

    df = model_results.df_resid - 1
    unadj_p = stats.t.sf(np.abs(resid), df) * 2
    corrected_p = multipletests(unadj_p, alpha=alpha, method=method)[1]

    data = np.c_[resid, unadj_p, corrected_p]
    mask = slice(None)
    if cutoff is not None:
        mask = data[:, -1] < cutoff
        data = data[mask]

    if labels is None:
        return data

    from pandas import DataFrame
    return DataFrame(data,
                     columns=['student_resid', 'unadj_p', method + "(p)"],
                     index=np.asarray(labels)[mask])
def html_report(outdir, infile, pwmfile, threshold=0.01):
    """Write ``gimme.roc.report.html`` for the significantly enriched motifs.

    Parameters
    ----------
    outdir : str
        Output directory; logos are written to ``<outdir>/logos``.
    infile : str
        Tab-separated statistics table, motif ids in the first column. Must
        contain a ``"P-value"`` column and the columns shown in the report.
    pwmfile : str
        Motif file read with ``read_motifs``. If a matching
        ``.motif2factors.txt`` file exists, a ``factors`` column is added.
    threshold : float
        Motifs are kept when their Benjamini-Hochberg corrected p-value is
        ``<= threshold``.
    """
    df = pd.read_table(infile, index_col=0)
    # clear the index name (``del df.index.name`` fails on recent pandas)
    df.index.name = None
    df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1]

    cols = [
        "Logo", "# matches", "# matches background", "P-value",
        "log10 P-value", "corrected P-value", "ROC AUC", "Enr. at 1% FPR",
        "Recall at 10% FDR"
    ]

    m2f = pwmfile.replace(".pwm", ".motif2factors.txt")
    if os.path.exists(m2f):
        sys.stderr.write("reading mapping\n")
        m2f = pd.read_table(m2f, index_col=0)
        m2f.columns = ["factors"]
        too_long = m2f["factors"].str.len() > 30
        # full text goes into the tooltip, the cell shows at most 30 chars
        m2f["factors"] = '<div title="' + m2f["factors"] + '">' + m2f[
            "factors"].str.slice(0, 30)
        m2f.loc[too_long, "factors"] += '(...)'
        m2f['factors'] += '</div>'
        df = df.join(m2f)
        cols = ["factors"] + cols

    # copy so the column assignment below does not act on a slice of df
    df = df[df["corrected P-value"] <= threshold].copy()
    df["Logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(x) for x in list(df.index)
    ]
    df = df[cols]

    os.makedirs(outdir + "/logos", exist_ok=True)
    with open(pwmfile) as pwm_handle:
        for motif in read_motifs(pwm_handle):
            if motif.id in df.index:
                motif.to_img(outdir + "/logos/{}.png".format(motif.id),
                             fmt="PNG")

    bar_cols = [
        "log10 P-value", "ROC AUC", "MNCP", "Enr. at 1% FDR", "Max enr.",
        "Recall at 10% FDR"
    ]
    # Several of these names are not among the report columns; asking the
    # Styler for a missing column raises KeyError on current pandas.
    bar_cols = [col for col in bar_cols if col in df.columns]

    template_dir = MotifConfig().get_template_dir()
    with open(os.path.join(template_dir, "sortable/sortable.min.js"),
              encoding="utf-8") as js_handle:
        js = js_handle.read()
    with open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
              encoding="utf-8") as css_handle:
        css = css_handle.read()

    with open(outdir + "/gimme.roc.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        if df.shape[0] > 0:
            styler = df.sort_values("ROC AUC", ascending=False).style.bar(bar_cols)
            # set_precision()/render() were removed in pandas 2.0; use the
            # replacements when available and fall back on older versions.
            try:
                styler = styler.format(precision=3)
            except TypeError:
                styler = styler.set_precision(3)
            styler = styler.set_table_attributes("data-sortable")
            if hasattr(styler, "to_html"):
                table_html = styler.to_html()
            else:
                table_html = styler.render()
            f.write(table_html.replace(
                "data-sortable", 'class="sortable-theme-slick" data-sortable'))
        else:
            f.write("No enriched motifs found.")
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
# Command-line arguments used below:
#   argv[2] comma-separated group sizes, argv[3] alpha, argv[4] number of
#   worker processes, argv[5] output path, argv[6] correction method.
# A real list (not a lazy map object) is needed: calc_kruskal takes its
# len() and the argument must be picklable for the worker processes.
sample_num_l = [int(size) for size in sys.argv[2].split(",")]
alpha = float(sys.argv[3])
partial_kruskal = partial(calc_kruskal, sample_num_l=sample_num_l, alpha=alpha)

pool = Pool(processes=int(sys.argv[4]))
try:
    result = pool.map(partial_kruskal, list(reader))
finally:
    # always release the worker processes, even if a row fails
    pool.close()
    pool.join()

# the second-to-last field of every result row is the Kruskal-Wallis p-value
p_val_list = [float(elem[-2]) for elem in result]
rej, pval_corr = smm.multipletests(p_val_list, alpha=alpha, method=sys.argv[6])[:2]

# repr() replaces the Python-2-only backticks; float() keeps a numpy scalar
# from printing with its type name.
result = [row + [repr(float(adj))] for row, adj in zip(result, pval_corr)]

with open(sys.argv[5], 'w') as f_out:
    f_out.write(header_line)
    f_out.writelines('\t'.join(i) + '\n' for i in result)

# with open(sys.argv[5], 'r') as correc:
#     correc_reader = csv.reader(correc, delimiter="\t")
#     correc_header_line = next(correc)
#     correc_header_line = correc_header_line.rstrip() + '\tp.adj'
#
#     p_val_list=[]
# Differential protein abundance: one Welch t-test per protein and comparison
comparisons_fc = []
for name, groups in comparisons.items():
    control_cols = groups["control"]
    condition_cols = groups["condition"]

    welch = ttest_ind(
        prot[control_cols].T,
        prot[condition_cols].T,
        equal_var=False,
        nan_policy="omit",
    )
    df = pd.DataFrame(welch, index=["tstat", "pvalue"], columns=prot.index).T
    df = df.astype(float).sort_values("pvalue").dropna()

    df["comparison"] = name
    df["fdr"] = multipletests(df["pvalue"], method="fdr_bh")[1]

    # NOTE(review): control uses the median while condition uses the mean --
    # confirm the mixed summary statistics are intended.
    control_centre = prot.loc[df.index, control_cols].median(1)
    condition_centre = prot.loc[df.index, condition_cols].mean(1)
    df["diff"] = control_centre - condition_centre

    comparisons_fc.append(df.reset_index())

comparisons_fc = pd.concat(comparisons_fc).sort_values("fdr")

comparisons_diff = pd.pivot_table(
    comparisons_fc, index="GeneSymbol", columns="comparison", values="diff"
)

comparisons_fc.to_csv(
    f"{DPATH}/perturbation_proteomics_diff_analysis.csv", index=False
)

# Plot distributions
fig, ax = plt.subplots(1, 1, figsize=(2, 1), dpi=600)
def run_experiment(depth, cutoff, out_folder, expression_path, categories_path,
                   id_names_path, col_names, phase_2_index, alter_id=True,
                   only_save=True, total_count_all=True, sig_p=0.05):
    """
    Performs enrichment analysis of gene functions.

    Every function (category) of every phase is tested with a hypergeometric
    test; all p-values are then corrected together with the
    Benjamini-Hochberg procedure. Results are written to text files in
    out_folder and passed on to the heatmap / gene-table helpers.

    :param depth: The depth of the functions (categories) used
    :param cutoff: Cutoff value for expression
    :param out_folder: Path to output folder
    :param expression_path: Path to expression values
    :param categories_path: Path to mappings of genes to functions
    :param id_names_path: Path to names and ids
    :param col_names: Names of columns used in the id_names_path file
    :param phase_2_index: Indexes of phases in the expression value file
    :param alter_id: Alternative id
    :param only_save: If true values are stored to out directory, heatmaps
        are not plotted
    :param total_count_all: If true the number of successes per function in
        the population is calculated from all genes, else only from
        expressed ones
    :param sig_p: The p-value considered as significant
    :return: None
    """
    gene_2_profile = import_profiles(expression_path)
    gene_2_cat = import_gene_2_categories(categories_path, depth)
    orig_2_alter = import_mappings(id_names_path, col_names, alter_id)

    # Suffix shared by the names of all output files.
    file_tag = str(cutoff).replace(".", "_") + "_depth_" + str(depth)

    # Get expressed genes by phases.
    phase_2_genes = {phase: [] for phase in phase_2_index}
    for gene in gene_2_profile:
        profile = gene_2_profile[gene]
        for phase in phase_2_index:
            if is_expressed(profile, phase_2_index[phase], cutoff):
                phase_2_genes[phase].append(gene)

    cluster_sample_count = []
    cat_2_tot_count = {}
    total_count = 0
    clus_2_res = {}
    all_cats = set()
    no_annot_gene_count = 0

    annotation_out_path = os.path.join(
        out_folder, "annotations_phase_cutoff_" + file_tag + ".txt")

    # Count, per phase, how often each category is hit by an expressed gene
    # and log every gene/category pair to the annotation file.
    with open(annotation_out_path, "w") as ann_out:
        for phase, expressed_genes in phase_2_genes.items():
            print("Analyzing phase " + phase)
            ann_out.write("\n")
            ann_out.write(phase + "\n")
            phase_counts = {}
            sample_count = 0
            for gene_id in expressed_genes:
                if gene_id not in orig_2_alter:
                    print("No orig -> alter mapping for " + gene_id)
                    continue
                alter_gene_id = orig_2_alter[gene_id]
                if alter_gene_id in gene_2_cat:
                    categories = gene_2_cat[alter_gene_id]
                else:
                    # Genes without annotation form their own category.
                    no_annot_gene_count += 1
                    categories = ["no_annotation"]
                for cat in categories:
                    phase_counts[cat] = phase_counts.get(cat, 0) + 1
                    sample_count += 1
                    ann_out.write(gene_id + "###" + cat + "\n")
                    all_cats.add(cat)
                    cat_2_tot_count[cat] = cat_2_tot_count.get(cat, 0) + 1
                    total_count += 1
            clus_2_res[phase] = phase_counts.copy()
            cluster_sample_count.append(sample_count)

    if total_count_all:
        cat_2_tot_count, total_count = get_total_function_counts(
            gene_2_profile, orig_2_alter, gene_2_cat, out_folder)
    print("Total number of different categories: " + str(len(all_cats)))

    total_sample_count = sum(cluster_sample_count)
    if not total_count_all and total_sample_count != total_count:
        raise Exception("Total count and total sample count must be equal!!!")
    print("OK")

    # One p-value per (phase, category), in dictionary iteration order; the
    # writing loop below walks the same order to match them up again.
    p_values = []
    for cluster_counter, c in enumerate(clus_2_res):
        print("Calculating p-values sample: " + str(cluster_counter))
        for cat in clus_2_res[c]:
            p_values.append(
                hypergeometric_over(clus_2_res[c][cat],
                                    cluster_sample_count[cluster_counter],
                                    cat_2_tot_count[cat], total_count))

    # benjamini hochberg correction
    p_adjusted = multi.multipletests(p_values, method="fdr_bh")[1]

    res_out_path = os.path.join(out_folder,
                                "hyper_cutoff_" + file_tag + ".txt")
    res_out_path_filtered = os.path.join(
        out_folder, "hyper_filtered_cutoff_" + file_tag + ".txt")

    column_header = "\tquant\tsample\thit\ttotal\tp_value\tp_adj\tlog_odds" "\n"
    p_counter = 0
    with open(res_out_path, "w") as out, \
            open(res_out_path_filtered, "w") as out_filter:
        for cluster_counter, c in enumerate(clus_2_res):
            print("Storing sample " + str(cluster_counter))
            out.write(c + column_header)
            out_filter.write(c + column_header)
            sample = cluster_sample_count[cluster_counter]
            total = total_count
            rows = []
            for cat, quant in clus_2_res[c].items():
                hit = cat_2_tot_count[cat]
                out_line = "\t".join([
                    cat, str(quant), str(sample), str(hit), str(total),
                    str(p_values[p_counter]), str(p_adjusted[p_counter])
                ])
                # The log odds ratio is only defined when none of the four
                # contingency-table cells is empty.
                cells_non_empty = (quant != 0 and (sample - quant) != 0
                                   and (total - hit - sample + quant) != 0
                                   and (hit - quant) != 0)
                if cells_non_empty:
                    odds_sample = quant / (sample - quant)
                    odds_rest = (hit - quant) / (total - hit - sample + quant)
                    real_log_odds = math.log2(odds_sample / odds_rest)
                    out_line += "\t" + "%.2f" % real_log_odds
                else:
                    out_line += "\tnan"
                rows.append((p_adjusted[p_counter], out_line))
                p_counter += 1
            # Most significant categories first.
            rows.sort()
            for adjusted, line in rows:
                out.write(line + "\n")
                if adjusted < sig_p:
                    out_filter.write(line + "\n")
            out.write("\n")
            out_filter.write("\n")

    # set paths
    table_path_log = os.path.join(out_folder,
                                  "log_odds_cutoff_" + file_tag + ".xlsx")
    table_p_values_path = os.path.join(
        out_folder, "p_adj_values_cutoff_" + file_tag + ".xlsx")
    gene_sig_path = os.path.join(
        out_folder, "genes_functions_cutoff_" + file_tag + ".xlsx")

    plot_store_heatmap(res_out_path, table_path_log, p_adj=False,
                       only_save=only_save, sig_p=sig_p)
    plot_store_heatmap(res_out_path, table_p_values_path, p_adj=True,
                       only_save=only_save, sig_p=sig_p)
    store_genes_with_significant_functions(annotation_out_path,
                                           table_p_values_path, id_names_path,
                                           col_names, gene_sig_path, alter_id)
# QQ plot: observed against expected -log10(p-value)
fig, ax = plt.subplots()
ax.scatter(p_values_df["uniform_logP"], p_values_df["log_p_values"])
# Diagonal reference line
ax.plot([8, 0], [8, 0], color="black")
ax.set_title("QQ Plot")
ax.set_xlabel("Expected -log10(p-value)")
ax.set_ylabel("Observed -log10(p-value)")
fig.savefig("qq_plot.png")

# Flag transcripts that are differentially expressed at a 10% false
# discovery rate (element 0 of the result is the boolean reject mask)
bh_result = multitest.multipletests(p_values_df["p_values"],
                                    method="fdr_bh",
                                    alpha=0.10)
p_values_df["fdr_0.10"] = bh_result[0]

# Write the flagged transcripts to an output file
significant_transcripts = p_values_df.loc[p_values_df["fdr_0.10"],
                                          "Transcript"]
significant_transcripts.to_csv("diff_expression.txt", index=False)

# Repeat analysis, but with sex as a covariate
p_values_cov = []
transcript_names = fpkm_reformat["t_name"].unique()
for transcript in transcript_names:
    # Get all expression data for one transcript
    is_current = fpkm_reformat["t_name"] == transcript
    transcript_data = fpkm_reformat[is_current]
    # Use OLS to test if transcript is differentially expressed across stages while controlling for sex
def melt_upper_triangle(df_, val_str):
    """Return the strict upper triangle of a square table in long form.

    Cells below the diagonal are masked, the remaining cells are stacked
    into rows (NaN cells are dropped by ``stack``), and pairs whose two
    labels are equal (the diagonal) are removed.

    :param df_: square DataFrame with the same labels on rows and columns
    :param val_str: name of the value column in the result
    :return: DataFrame indexed by ('OTU_1', 'OTU_2') with one column val_str
    """
    # The `np.bool` alias was removed in NumPy 1.24; build the mask with the
    # builtin bool dtype instead.
    upper_mask = np.triu(np.ones(df_.shape, dtype=bool))
    dfnan = df_.where(upper_mask)
    melted_df = dfnan.stack().reset_index()
    melted_df.columns = ['OTU_1', 'OTU_2', val_str]
    melted_df2 = melted_df[melted_df['OTU_1'] != melted_df['OTU_2']]
    return melted_df2.set_index(['OTU_1', 'OTU_2'])


# One row per unordered pair, carrying both the correlation and its p-value.
mpdf = melt_upper_triangle(p_df, 'p-value')
mdf = melt_upper_triangle(df, 'correlation')
fulldf = mdf.join(mpdf)
# pull total abundances
# pull taxonomy (order?)
# Keep the pairs that pass Benjamini-Hochberg at alpha = 0.05 ...
reject, pvals_corrected = multipletests(fulldf['p-value'].values,
                                        alpha=0.05,
                                        method='fdr_bh')[:2]
thresholded = fulldf.loc[fulldf.index[reject], ['correlation']].reset_index()
# ... and whose absolute correlation exceeds 0.5.
corr_cutoff = abs(thresholded.correlation) > 0.5
thresholded_cutoff = thresholded[corr_cutoff]
thresholded_cutoff.to_csv(
    "/Volumes/KeithSSD/CB_V4/otu_data/sparcc_data/test_correlations.txt",
    sep="\t",
    index=False)
hemi)) compare_dict = CsvReader(compare_file).to_dict(1) valid_idx_mat = np.array(compare_dict['p']) != 'nan' if mask_file is not None: mask_vertices = nib.freesurfer.read_label(mask_file) mask_idx_mat = np.zeros_like(valid_idx_mat, dtype=np.bool) mask_idx_mat[mask_vertices] = True valid_idx_mat = np.logical_and(valid_idx_mat, mask_idx_mat) compare_data = np.zeros((3, maps.shape[1])) ps_uncorrected = np.array([ float(p) for idx, p in enumerate(compare_dict['p']) if valid_idx_mat[idx] ]) reject, ps_corrected, alpha_sidak, alpha_bonf = multipletests( ps_uncorrected, 0.05, 'fdr_bh') ts = [ float(t) for idx, t in enumerate(compare_dict['t']) if valid_idx_mat[idx] ] compare_data[0, valid_idx_mat] = ts compare_data[1, valid_idx_mat] = -ps_uncorrected compare_data[2, valid_idx_mat] = -ps_corrected compare_data[0, np.logical_not(valid_idx_mat)] = np.min(ts) compare_data[1, np.logical_not(valid_idx_mat)] = np.min(-ps_uncorrected) compare_data[2, np.logical_not(valid_idx_mat)] = np.min(-ps_corrected) save2nifti( pjoin(compare_dir, '{}_g1_vs_g2_posterior_masked.nii.gz'.format(hemi)), compare_data) # ---compare2nifti end---
else: sig_p.append(np.nan) log_pvals = -(np.log10(p_vals)) cor_alpha = 0.05 / 90 cor_alphalog = -(np.log10(cor_alpha)) ax, fig = plt.subplots() plt.plot(log_pvals) plt.xlabel('Samples') plt.ylabel('-log(p)') plt.hlines(cor_alphalog, 0, 90, color='red') import statsmodels.stats.multitest as sm bools, p_adj, x, x2 = sm.multipletests(p_vals, method='bonferroni') # Use MNE cluster permutation X_input = [face_5chs, scene_5chs] X_3Dinput = [dat_5chs, dat_5chs_face] Fobs, clusters, clusters_pval, H0 = mne.stats.permutation_cluster_test( X_3Dinput) Fobs1, clusters1, clusters_pval1, H01 = mne.stats.permutation_cluster_test( X_input, n_permutations=10) plt.plot(clusters1) #%% # Get evoked scene_evoked_allSubs = MNEevoked_scene._data
def compute_fdr_by_dist(d):
    """Add a Benjamini-Hochberg corrected 'fdr_dist' column to *d*.

    The correction is computed over the 'pvalue' column of *d*; the frame
    is modified in place and also returned.
    """
    raw_pvalues = list(d['pvalue'])
    corrected = multipletests(raw_pvalues, method='fdr_bh')
    d.loc[:, 'fdr_dist'] = corrected[1]
    return d
def simple_auto_stationarize(df, verbosity=None, alpha=None, multitest=None,
                             get_conclusions=False, get_actions=False):
    """Auto-stationarize the given time-series dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        A dataframe composed solely of numeric columns.
    verbosity : int, logging.Logger, optional
        If an int is given, it is interpreted as the logging level to use. See
        https://docs.python.org/3/library/logging.html#levels for details. If a
        logging.Logger object is given, it is used for printing instead, with
        appropriate logging levels. If no value is provided, the default
        logging.Logger behaviour is used.
    alpha : float, optional
        Family-wise error rate (FWER) or false discovery rate (FDR), depending
        on the method used for multiple hypothesis testing error control. If no
        value is provided, the module default ``DEF_ALPHA`` is used.
    multitest : str, optional
        The multiple hypothesis testing error control method to use. If no
        value is provided, the Benjamini-Yekutieli procedure ('fdr_by') is
        used. See `the documentation of statsmodels' multipletests method for
        supported values
        <https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html>`.
    get_conclusions : bool, defaults to False
        If set to true, a conclusions dict is returned.
    get_actions : bool, defaults to False
        If set to true, an actions dict is returned.

    Returns
    -------
    results : pandas.DataFrame or dict
        By default, only the transformed dataframe is returned. However, if
        get_conclusions or get_actions are set to True, a dict is returned
        instead, with the following mappings:
        - `postdf` - Maps to the transformed dataframe.
        - `conclusions` - Maps to a dict mapping each column name to the
          arrived conclusion regarding its stationarity.
        - `actions` - Maps to a dict mapping each column name to the
          transformations performed on it to stationarize it.

    Raises
    ------
    ValueError
        If any column of ``df`` is not numeric.
    """  # noqa: E501
    prev_verbosity = None
    if verbosity is not None:
        prev_verbosity = set_verbosity_level(verbosity)
    if alpha is None:
        alpha = DEF_ALPHA
    if multitest is None:
        multitest = 'fdr_by'

    try:
        postdf, conclusions, actions = _stationarize_columns(
            df, alpha, multitest)
    finally:
        # Restore the caller's verbosity even when the run fails.
        if verbosity is not None:
            set_verbosity_level(prev_verbosity)

    if not get_actions and not get_conclusions:
        return postdf
    results = {'postdf': postdf}
    if get_conclusions:
        results['conclusions'] = conclusions
    if get_actions:
        results['actions'] = actions
    return results


def _stationarize_columns(df, alpha, multitest):
    """Test every column of df and transform it; return (postdf, conclusions, actions)."""
    logger = get_logger()
    logger.info("Starting to auto-stationarize a dataframe!")
    logger.info("Starting to check input data validity...")
    logger.info(f"Data shape (time, variables) is {df.shape}.")
    # the first axis - rows - is expected to represent the time dimension,
    # while the second axis - columns - is expected to represent variables;
    # thus, the first expected to be much longer than the second
    logger.info(
        "Checking current data orientation (rows=time, columns=variables)...")
    if df.shape[1] >= df.shape[0]:
        logger.warning((
            "stationarizer's input dataframe has more columns than rows! "
            "Columns are expected to represent variables, while rows represent"
            " time steps, and thus the input dataframe is expected to have "
            "more rows than columns. Either the input data is inverted, or the"
            " data has far more variables than samples."))
    else:
        logger.info("Data orientation is valid.")

    # all columns must be numeric; the original built this error and only
    # logged it, letting the statistical tests fail later on instead
    all_cols_numeric = all(np.issubdtype(x, np.number) for x in df.dtypes)
    if not all_cols_numeric:
        err = ValueError(
            "All columns of stationarizer's input dataframe must be numeric!")
        logger.error(str(err))
        raise err

    # util var
    n = len(df.columns)

    # testing for unit root
    logger.info(
        ("Checking for the presence of a unit root in the input time series "
         "using the Augmented Dicky-Fuller test"))
    logger.info(
        ("Reminder:\n "
         "Null Hypothesis: The series has a unit root (value of a=1); meaning,"
         " it is NOT stationary.\n"
         "Alternate Hypothesis: The series has no unit root; it is either "
         "stationary or non-stationary of a different model than unit root."))
    adf_results = []
    for colname in df.columns:
        srs = df[colname]
        result = adfuller(srs, regression='ct')
        logger.info(
            (f"{colname}: test statistic={result[0]}, p-val={result[1]}."))
        adf_results.append(result)

    # testing for trend stationarity
    logger.info((
        "Testing for trend stationarity of input series using the KPSS test."))
    logger.info(("Reminder:\n"
                 "Null Hypothesis (H0): The series is trend-stationarity.\n"
                 "Alternative Hypothesis (H1): The series has a unit root."))
    kpss_results = []
    for colname in df.columns:
        srs = df[colname]
        result = kpss(srs, regression='ct')
        logger.info(
            (f"{colname}: test statistic={result[0]}, p-val={result[1]}."))
        kpss_results.append(result)

    # Controlling the error rate over all 2*n tests at once; the method and
    # level actually in use are reported (not the module defaults).
    logger.info(
        ("Controling the error rate of multiple hypothesis testing using "
         f"the '{multitest}' procedure with α={alpha}."))
    adf_pvals = [x[1] for x in adf_results]
    kpss_pvals = [x[1] for x in kpss_results]
    pvals = adf_pvals + kpss_pvals
    by_res = multipletests(
        pvals=pvals,
        alpha=alpha,
        method=multitest,
        is_sorted=False,
    )
    reject = by_res[0]
    corrected_pvals = by_res[1]
    # first n entries belong to the ADF tests, the last n to the KPSS tests
    adf_rejections = reject[:n]
    kpss_rejections = reject[n:]
    adf_corrected_pvals = corrected_pvals[:n]
    kpss_corrected_pvals = corrected_pvals[n:]

    # interpret results
    logger.info("Interpreting test results after FDR control...")
    conclusion_counts = {}
    conclusions = {}
    actions = {}
    for i, colname in enumerate(df.columns):
        conclusion = conclude_adf_and_kpss_results(
            adf_reject=adf_rejections[i], kpss_reject=kpss_rejections[i])
        conclusion_counts[conclusion] = conclusion_counts.get(conclusion, 0) + 1
        trans = CONCLUSION_TO_TRANSFORMATIONS[conclusion]
        conclusions[colname] = conclusion
        actions[colname] = trans
        logger.info((f"--{colname}--\n "
                     f"ADF corrected p-val: {adf_corrected_pvals[i]}, "
                     f"H0 rejected: {adf_rejections[i]}.\n"
                     f"KPSS corrected p-val: {kpss_corrected_pvals[i]}, "
                     f"H0 rejected: {kpss_rejections[i]}.\n"
                     f"Conclusion: {conclusion}\n Transformations: {trans}."))

    # making non-stationary series stationary!
    post_cols = {}
    logger.info("Applying transformations...")
    for colname in df.columns:
        srs = df[colname]
        if Transformation.DETREND in actions[colname]:
            logger.info(f"Detrending {colname} (len={len(srs)}).")
            srs = detrend(srs, order=1, axis=0)
        if Transformation.DIFFRENTIATE in actions[colname]:
            logger.info(f"Diffrentiating {colname} (len={len(srs)}).")
            srs = diff(srs, k_diff=1)
        post_cols[colname] = srs
        logger.info(f"{colname} transformed (len={len(post_cols[colname])}).")

    # equalizing lengths
    # NOTE(review): series are truncated from the end; if differencing
    # shortens a series at its start, columns may end up shifted by one time
    # step relative to each other - confirm the intended alignment.
    min_len = min(len(post_cols[x]) for x in post_cols)
    for colname in df.columns:
        post_cols[colname] = post_cols[colname][:min_len]
    postdf = df.copy()
    postdf = postdf.iloc[:min_len]
    for colname in df.columns:
        postdf[colname] = post_cols[colname]
    logger.info(f"Post transformation shape: {postdf.shape}")

    for k in conclusion_counts:
        count = conclusion_counts[k]
        ratio = 100 * (count / len(df.columns))
        logger.info(f"{count} series ({ratio}%) found with conclusion: {k}.")

    return postdf, conclusions, actions
def getCorrectedPValues(pval_raw, alpha=0.05, method='fdr_i'):
    """Return the p-values of *pval_raw* corrected for multiple testing.

    :param pval_raw: raw p-values
    :param alpha: error rate handed to ``multipletests``
    :param method: correction method name handed to ``multipletests``
    :return: the corrected p-values (second element of the result)
    """
    reject_and_corrected = smm.multipletests(pval_raw,
                                             alpha=alpha,
                                             method=method)[:2]
    _, corrected = reject_and_corrected
    return corrected
return gsea_dat ### Thresholding HIT = 10 LOWER = 20 UPPER = 500 alpha = 0.05 topGeneN = int(sys.argv[1]) mf_genes_SI = readin_gsea_result_SI('topGene%d' % topGeneN,HIT, lower=LOWER, higher=UPPER) mf_genes_SI['BH_p'] = multitest.multipletests(mf_genes_SI['pvalue'], method = 'fdr_bh')[1] print(len(set(mf_genes_SI[mf_genes_SI['BH_p'] < alpha].index))) x = mf_genes_SI[mf_genes_SI['group'] != 0] print(len(set(x[x['BH_p'] < alpha].index))) suffix = '%s_stringent' % FMfn mf_genes_SI.to_csv('/work-zfs/abattle4/heyuan/tissue_spec_eQTL_v8/plots/Fig3_GSEA_%s.txt' % suffix, sep='\t', index=True) mf_genes_SI = readin_gsea_result_SI('topGene30',HIT, lower=LOWER, higher=UPPER) mf_genes_SI['BH_p'] = multitest.multipletests(mf_genes_SI['pvalue'], method = 'fdr_bh')[1] print(len(set(mf_genes_SI[mf_genes_SI['BH_p'] < alpha].index))) x = mf_genes_SI[mf_genes_SI['group'] != 0]
def rep_compare(valueD, total_first, total_second, method, log2, scale,
                log2_already):
    """Compare two replicate groups per feature and FDR-correct the hits.

    For every entry of valueD (id -> (values of group 1, values of group 2))
    the values are normalised by the group total and *scale* (optionally
    log2-transformed after adding 1), and the difference of the group means
    is computed. Features with |difference| >= 0.2 are tested with
    ``stat_pvalue``; those with p < 0.2 are kept and their p-values are
    corrected with Benjamini-Hochberg.

    Returns a list of rows
    ``[id, *group1, *group2, mean1, mean2, diff, p, adjusted p]`` sorted by
    adjusted p-value.
    """

    def _normalise(raw_values, total):
        # Scale to the group total; optionally log2-transform with pseudocount.
        if log2:
            return [log((float(raw) + 1) * scale / total, 2)
                    for raw in raw_values]
        return [float(raw) * scale / total for raw in raw_values]

    kept_rows = []
    kept_pvalues = []
    for feature_id, groups in valueD.items():
        first = _normalise(groups[0], total_first)
        mean_first = sum(first) / len(first)
        second = _normalise(groups[1], total_second)
        mean_second = sum(second) / len(second)

        # The row stores the means as computed, before the zero adjustment.
        row = [feature_id] + first + second + [mean_first, mean_second]

        # Avoid a zero in the ratio below.
        if mean_first * mean_second == 0:
            mean_first += 1
            mean_second += 1

        if log2_already:
            diff = mean_second - mean_first
        else:
            diff = log(mean_second / mean_first, 2)

        large_enough = abs(diff) >= 0.2
        p = stat_pvalue(first, second, method) if large_enough else 0.5

        # The trailing 1 is a placeholder for the adjusted p-value.
        row.extend([diff, p, 1])
        if large_enough and p < 0.2:
            kept_rows.append(row)
            kept_pvalues.append(p)

    if kept_pvalues:
        adjusted = multipletests(kept_pvalues, method="fdr_bh")[1]
        for row, p_adj in zip(kept_rows, adjusted):
            row[-1] = p_adj

    return sorted(kept_rows, key=lambda row: row[-1])
def main(argv=sys.argv):
    """Command-line entry point; runs the step selected with ``-m``.

    Modes handled: 'train', 'gen_bed', 'neg', 'cv', 'score', 'null' and
    'simulation'. Any other mode (including the default 'sort') only loads
    the data and prints the data set sizes.

    :param argv: full argument vector, program name first (as sys.argv)
    """
    # NOTE(review): several help strings below look copy-pasted ("coding
    # file", "mode") and the -c/-n/-s/-g options are never read here.
    parser = argparse.ArgumentParser(description='MODriver v1.0')
    parser.add_argument("-c", dest='coding', default="./coding_key.csv", help="coding file")
    parser.add_argument("-n", dest='non_coding', default="./non_coding_key.csv", help="non_coding file")
    parser.add_argument("-s", dest='pos', default="./pos_2018.txt", help="coding file")
    parser.add_argument("-g", dest='neg', default="./neg_2018.txt", help="non_coding file")
    parser.add_argument("-m", dest='mode', default="sort", help="mode")
    parser.add_argument("-l", dest='learn', default="MODNN", help="mode")
    parser.add_argument("-t", dest='type', default="Pancan", help="cancer type")
    parser.add_argument("-o", dest='out', default="./score/", help="coding file")
    parser.add_argument("-p", dest='threads_num', type=int, default=1, help="threads num")
    # Honour the argv parameter (it used to be silently ignored).
    args = parser.parse_args(argv[1:])

    df_tmp = pd.read_csv('./chr_id.txt', header=0, index_col=3, sep='\t', usecols=[0, 1, 2, 3])
    all_list = df_tmp.index.tolist()
    key_2018 = './key_2018.txt'
    # if args.type != 'Pancan':
    #     key_2018 = "./input/%s.key" % args.type
    pd_key = pd.read_csv(key_2018, header=None, sep='\t')
    pd_neg = pd.read_csv('./neg_2018.txt', header=None, sep='\t')
    pd_neg.columns = ['gene']
    pd_key.columns = ['gene', 'type']
    pd_key = pd_key.drop_duplicates(subset=['gene'], keep='first')
    pd_neg = pd_neg.drop_duplicates(subset=['gene'], keep='first')
    key_18 = pd_key['gene'].values.tolist()
    neg_18 = pd_neg['gene'].values.tolist()
    known_key = ['TERT']
    neg_key = ['CACNA1E', 'COL11A1', 'DST', 'TTN']
    key_18 = list(set(key_18) | set(known_key))
    #neg_key = list(set(neg_18) | set(neg_key))
    pos, neg = build_set(key_18, neg_key, all_list, nb_imb=20)
    # pos, neg = pickle.load(open('pos.neg', 'rb'))
    X_train, y_train, X, X_sim, ids = file2data(args.type, pos, neg)
    print(X_train[0].shape[0], X[0].shape[0], X_sim[0].shape[0])

    if args.mode == 'train':
        fit(X_train, y_train, args.type, method=args.learn)
    elif args.mode == 'gen_bed':
        bed_input = 'PCAWG_test_genomic_elements.bed12.gz'
        out = 'chr_id.bed'
        df = pd.read_csv(bed_input, header=None, sep='\t', usecols=[0, 1, 2, 3])
        df.columns = ['chr', 'start', 'end', 'id']
        df.index = df['id']
        ids = df.loc[::, 'id'].values.tolist()
        ban_list = [
            '::TTN::', '::DST::', '::DMD::', '::CACNA1E::', '::COL11A1::',
            '::mitranscriptome::'
        ]
        # Keep only the elements whose id contains none of the banned tags.
        ids_new = [
            elem_id for elem_id in ids
            if not any(ban in elem_id for ban in ban_list)
        ]
        df = df.loc[ids_new, ::]
        df['chr'] = df['chr'].apply(lambda x: str(x).replace("chr", ""))
        df = df.sort_values(by=['chr', 'start'], ascending=[True, True])
        df.to_csv(out, header=False, index=False, sep='\t')
    elif args.mode == 'neg':
        apps = [
            '2020plus', 'ActiveDriver', 'CompositeDriver', 'MuSiC',
            'MutSig2CV', 'OncodriveCLUST', 'OncodriveFML', 'e-Driver'
        ]
        # Negative set: genes with qvalue above thr in every listed tool.
        nb_line = 0
        for app in apps:
            nb_line += 1
            thr = 0.6
            path = '../coding/%s/PANCAN.txt' % app
            df = pd.read_csv(path, header=0, sep='\t', index_col=0, usecols=['gene', 'qvalue'])
            df = df[df['qvalue'] > thr]
            if nb_line == 1:
                neg_list = set(df.index.tolist())
            else:
                neg_list = neg_list & set(df.index.tolist())
        neg = list(neg_list)
        df = pd.DataFrame(data=neg, index=None, columns=['gene'])
        out = './neg_2018.txt'
        df.to_csv(out, header=False, index=False, sep='\t')
    elif args.mode == 'cv':
        fit_cv(X_train, y_train, 10, args.learn, False)
    elif args.mode == 'score':
        y_p = predict(X, args.type, method=args.learn)
        null_dist_path = '%s%s.null' % (args.out, args.type)
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # null distributions produced by the 'null' mode of this program.
        with open(null_dist_path, 'rb') as f:
            null_dist = pickle.load(f)
        df_all = pd.DataFrame(data=y_p, index=ids, columns=['score'])

        # Group element ids by their type, the first '::'-separated field.
        ge_type = {}
        for elem_id in ids:
            tmp = re.split('::', elem_id)[0]
            tmp = str(tmp).replace("gc19_pc.", "")
            ge_type.setdefault(tmp, []).append(elem_id)

        nb_coding_drivers = 0
        nb_noncoding_drivers = 0
        dfs = []
        for key in ge_type.keys():
            # .copy() so the new columns are not written into a view of df_all.
            df_score = df_all.loc[ge_type[key], ::].copy()
            out_path = '%s%s.%s.score' % (args.out, args.type, key)
            # Empirical p-value: fraction of null scores above the score.
            pvals = 1 - null_dist(df_score['score'].values.tolist())
            df_score['p'] = pvals
            p_min = 1e-6
            df_score.loc[df_score['p'] < p_min, 'p'] = p_min
            # NOTE(review): q-values are computed from the unclamped pvals,
            # not from the clamped 'p' column - confirm this is intended.
            _, qvals, _, _ = mt.multipletests(pvals=pvals, alpha=0.1, method='fdr_bh')
            df_score['q'] = qvals
            df_show = df_score[df_score['q'] < 0.1]
            dfs.append(df_show)
            if key == 'cds':
                nb_coding_drivers += df_show.shape[0]
            else:
                nb_noncoding_drivers += df_show.shape[0]
            df_score = df_score.sort_values(by=['score'], ascending=[False])
            df_score.to_csv(out_path, header=True)
        out_path = "%s%s.%s.score" % ("./", args.type, 'all')
        df = pd.concat(dfs, axis=0)
        df = df.sort_values(by=['score'], ascending=[False])
        df.to_csv(out_path, header=True)
        print(nb_coding_drivers + nb_noncoding_drivers, nb_coding_drivers, nb_noncoding_drivers)
    elif args.mode == 'null':
        y_sim = predict(X_sim, args.type, method=args.learn, b_null=True)
        df_sim = pd.DataFrame(data=y_sim, columns=['score'])
        out_path = '%s%s.null' % (args.out, args.type)
        null_dist = sm.distributions.ECDF(df_sim['score'].values.tolist())
        with open(out_path, 'wb') as fp:
            pickle.dump(null_dist, fp)
    elif args.mode == 'simulation':
        tmp_dir = '/data/tmp/'
        random_out_file = 'simulation.txt.gz'
        # based on the ori maf file
        ori_input = '../data/ICGC/final_consensus_passonly.snv_mnv_indel.icgc.public.maf.gz'
        col0 = [
            'Chromosome', 'Start_position', 'End_position', 'Reference_Allele',
            'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode',
            'Matched_Norm_Sample_Barcode'
        ]
        promoter_set = ['TERT', 'MALAT1', 'NEAT1']
        df = pd.read_csv(ori_input, header=0, sep='\t', usecols=col0 + ['Hugo_Symbol'])
        # remove the mutations in the TERT promoter, MALAT1, or NEAT1
        df_anno = df.loc[~df['Hugo_Symbol'].isin(promoter_set), col0]
        all_input_file = '%s/all_input.txt' % tmp_dir
        all_out_file = '%s/all_out.txt' % tmp_dir
        df_anno.to_csv(all_input_file, header=False, index=False, sep='\t')
        # The command is assembled from fixed paths and an int only, so
        # running it through the shell is not exposed to user input.
        cmd = "python parallel_do.py -c 'python simulation.py -i %s -o %s' -t %d --r" % (
            all_input_file, all_out_file, args.threads_num)
        # cmd = 'python simulation.py -i %s -o %s' % (all_input_file, all_out_file)
        print(cmd)
        check_output(cmd, shell=True)
        df = pd.read_csv(all_out_file, header=None, sep='\t')
        df.columns = [
            'Chromosome', 'Start_position', 'End_position',
            'Variant_Classification', 'Variant_Type', 'Reference_Allele',
            'Tumor_Seq_Allele2', 'Tumor_Sample_Barcode',
            'Matched_Norm_Sample_Barcode', 'gc_content'
        ]
        df.to_csv(random_out_file, header=True, index=False, sep='\t', compression='gzip', float_format='%.3f')
        print("random mutations: " + str(df.shape[0]))
def resampleAllGo(go_term_groups, goi, go_terms, essential_count,
                  non_essential_count, n=10000, save_intermediate=True):
    """Resampling test for GO terms over-represented among the genes of interest.

    n random gene sets with the same number of essential and non-essential
    genes are drawn. For every GO term that occurs more than once among the
    genes of interest (and at least ``min_goi_count`` times), the fraction of
    random sets reaching the same count is used as raw p-value, which is
    then Benjamini-Hochberg corrected. The table is written to a .tsv file.

    Reads module-level names (essential_genes, min_goi_count, all_go_counts,
    min_go_count, max_go_count, onlyBioprocess). Adds the columns
    'essential', 'in_goi' and 'in_random' to the go_terms frame passed in.
    Returns None.
    """
    print('...resampling all GO terms...')

    # label every row: essential gene or not, gene of interest or not
    go_terms['essential'] = go_terms.index.isin(essential_genes)
    go_terms['in_goi'] = go_terms.index.isin(goi)
    is_essential = go_terms['essential'] == True
    essential_go_df = go_terms[is_essential]
    non_essential_go_df = go_terms[go_terms['essential'] == False]
    goi_go_df = go_terms[go_terms['in_goi'] == True]

    print('- saving intermediate?: ' + str(save_intermediate))
    if save_intermediate == True:
        goi_go_df.to_csv('goi_go_df.tsv', sep='\t', index=True)

    # GO terms carried by more than one row of the genes of interest
    all_goi_go_terms = goi_go_df.go_term.tolist()
    print('- number of go terms among goi (including dup.): ' +
          str(len(all_goi_go_terms)))
    unique_goi_go_terms = list(set(all_goi_go_terms))
    print('- number of go terms among goi (without dup.): ' +
          str(len(unique_goi_go_terms)))
    term_frequencies = collections.Counter(all_goi_go_terms)
    goi_go_dupes = [
        term for term, occurrences in term_frequencies.items()
        if occurrences > 1
    ]
    print('-number of duplicated go terms among goi): ' +
          str(len(goi_go_dupes)))

    # how often each GO term occurs among the genes of interest
    goi_go_counts = goi_go_df.go_term.value_counts()

    # draw n random gene sets: essential genes first, then non-essential
    print('- number of random samples: ' + str(n))
    essential_pool = essential_go_df.index.values.tolist()
    non_essential_pool = non_essential_go_df.index.values.tolist()
    samples = []
    for _ in range(n):
        drawn = list(random.sample(essential_pool, essential_count))
        drawn += list(random.sample(non_essential_pool, non_essential_count))
        go_terms['in_random'] = go_terms.index.isin(drawn)
        samples.append(go_terms[go_terms['in_random'] == True])
    print(samples[0])
    go_terms = go_terms.drop(columns=['in_random'])

    # GO term counts of every random set
    sample_counts = [sample.go_term.value_counts() for sample in samples]

    def _count_in(sample_count, term):
        # A term absent from a random set counts as zero.
        try:
            return sample_count[term]
        except KeyError:
            return 0

    # compare the count among the genes of interest with all n random sets
    rv_dict = {}
    for term in goi_go_dupes:
        n_go_goi = goi_go_counts[term]
        if n_go_goi < min_goi_count:
            continue
        random_counts = []
        n_at_least_goi = 0
        for sample_count in sample_counts:
            n_go_sample = _count_in(sample_count, term)
            if n_go_sample >= n_go_goi:
                n_at_least_goi += 1
            random_counts.append(n_go_sample)
        rv_dict[term] = [
            n_go_goi,
            np.median(random_counts),
            float(n_at_least_goi) / float(n),
        ]

    # adjust for multiple hypothesis testing (benjamini hochberg method)
    rv_df = pd.DataFrame.from_dict(
        rv_dict,
        orient='index',
        columns=['count_in_goi', 'median_count_in_random_sample', 'raw_rv'])
    raw_values = rv_df['raw_rv'].tolist()
    bh_result = smm.multipletests(raw_values, method='fdr_bh')
    rv_df['bh_rv'] = np.asarray(bh_result[1].tolist())

    # add the overall gene count of every GO term
    rv_df.reset_index(inplace=True)
    rv_df['GO_term_gene_count'] = rv_df.apply(
        lambda row: all_go_counts.loc[row['index'], 'count'], axis=1)
    rv_df.set_index('index', inplace=True)
    rv_df.sort_values(by=['bh_rv'], inplace=True)

    out_name = ('resample_v6_goi_go_df_repeatfiltered_min' +
                str(min_go_count) + 'max' + str(max_go_count) + '_mingoi' +
                str(min_goi_count) + 'onlyBioprocess' + str(onlyBioprocess) +
                '.tsv')
    rv_df.to_csv(out_name, sep='\t', index=True)
    return
def get_sign_pvals(self, alpha=0.1, min_present=5):
    '''Get FDR corrected p-values for rejecting the null hypothesis that the signs of the ratios originate from a p=0.5 binomial distribution.

    This test is used in order to identify features that increase/decrease significantly.
    For example, if the RatioExperiments is created for pre- and post-treatment samples of individuals (ratio is pre/post),
    get_sign_pvals can be used to identify features that significantly increase/decrease following the treatment.

    NOTE: The test is performed only on the non nan feature values.

    Parameters
    ----------
    alpha: float, optional
        The required FDR control level
    min_present: int, optional
        The minimal number of samples where the ratio is not nan or zero in order to include in the test.
        Used as filtering to achieve better FDR power (less hypothesis to test).
        Features without any non-zero finite ratio are always excluded.

    Returns
    -------
    RatioExperiment
        Only features with higher than random number of positive or negative ratios.
        Features are sorted by the effect size (and by p-value for similar effect size).
        The feature_metadata contains 4 new fields: '_calour_stat', '_calour_pval', '_calour_qval', '_calour_direction',
        similar to calour.analysis.diff_abundance().
        If no feature has enough ratios, an experiment without features is returned.
    '''
    exp = self.copy()
    # need to convert to non-sparse in order to use np.isfinite()
    exp.sparse = False

    # scipy.stats.binom_test was removed in SciPy 1.12; prefer its
    # replacement binomtest and fall back only on old SciPy versions.
    binomtest = getattr(scipy.stats, 'binomtest', None)

    keep = []
    pvals = np.ones(exp.shape[1])
    esize = np.zeros(exp.shape[1])
    for idx in range(exp.shape[1]):
        cdat = exp.data[:, idx]
        finite = cdat[np.isfinite(cdat)]
        cnpos = int(np.sum(finite > 0))
        cnneg = int(np.sum(finite < 0))
        total = cnpos + cnneg
        # test if we have enough non-zero samples; a feature without any
        # signed ratio cannot be tested (and would divide by zero below)
        if total == 0 or total < min_present:
            continue
        # calculate the binomial p-value and effect size for the feature
        if binomtest is not None:
            pvals[idx] = binomtest(cnpos, total).pvalue
        else:
            pvals[idx] = scipy.stats.binom_test(cnpos, total)
        esize[idx] = (cnpos - cnneg) / total
        keep.append(idx)

    logger.debug('keeping %d features with enough ratios' % len(keep))
    exp = exp.reorder(keep, axis='f')
    if len(keep) == 0:
        logger.warning('No significant features found')
        return exp
    pvals = pvals[keep]
    esize = esize[keep]

    # multiple testing correction using Benjamini-Hochberg FDR
    # note we cannot use dsFDR as this is not a 2 group test
    reject, qvals, *_ = multipletests(pvals, alpha=alpha, method='fdr_bh')
    newexp = _new_experiment_from_pvals(exp, None, reject, esize, pvals, qvals)

    # set the effect direction field
    newexp.feature_metadata[_CALOUR_DIRECTION] = [
        'positive' if x > 0 else 'negative'
        for x in newexp.feature_metadata[_CALOUR_STAT]
    ]
    logger.info('found %d significant' % len(newexp.feature_metadata))
    return newexp
'elem_id': setElem, 'population_size': populationSize, 'success_population': numSuccInPopulation, 'sample_size': sampleSize, 'success_samples': drawnSuccesses, 'pval': pval, 'sample_success_fraction': fractionOfHitSamples, 'genes': ";".join(successIntersection), 'direction': direction } setToResult[setElem] = resultObj sortedElems = [x for x in setToResult] elemPvals = [setToResult[x]["pval"] for x in sortedElems] rej, elemAdjPvals, _, _ = multipletests(elemPvals, alpha=0.05, method='fdr_bh', is_sorted=False, returnsorted=False) for eidx, elem in enumerate(sortedElems): assert (setToResult[elem]['pval'] == elemPvals[eidx]) setToResult[elem]['adj_pval'] = elemAdjPvals[eidx] for elem in sortedElems: dr = DataRow.fromDict(setToResult[elem]) outdf.addRow(dr) outdf.export(args.output.name)
def plot_heatmaps(xs, ys, rhos, p_values, time):
    """Export four heatmaps (rhos, raw p, corrected p, pass mask) for one run.

    Every heatmap is written three times below the module-level ``out_path``:
    as an offline HTML page, a PNG and a PDF named ``<kind>_<time>.<ext>``.

    :param xs: labels used for the heatmap x axis.
    :param ys: labels used for the heatmap y axis.
    :param rhos: values drawn unchanged in the ``rhos_*`` heatmap.
    :param p_values: array of raw p-values; it is flattened for the
        Benjamini-Hochberg correction (alpha 0.05) and the outputs are
        reshaped in place to ``(len(ys), len(xs))``.
    :param time: string embedded in every output file name.
    """
    margin = get_margin()
    axis_font = dict(family='Arial', color='black', size=2)
    layout = go.Layout(
        margin=margin,
        autosize=True,
        showlegend=False,
        yaxis=dict(
            type='category',
            showgrid=True,
            showline=True,
            mirror='ticks',
            titlefont=dict(axis_font),
            showticklabels=True,
            tickangle=0,
            tickfont=dict(axis_font),
            exponentformat='e',
            showexponent='all',
        ),
    )

    passed, p_values_corr, _, _ = multipletests(p_values.flatten(), 0.05, method='fdr_bh')
    grid_shape = (len(ys), len(xs))
    passed.shape = grid_shape
    passed = passed.astype(int)
    p_values_corr.shape = grid_shape

    def _export(kind, **heatmap_kwargs):
        # One figure, three files: interactive html first, then the static images.
        figure = go.Figure(data=go.Heatmap(x=xs, y=ys, **heatmap_kwargs), layout=layout)
        stem = out_path + '/' + kind + '_' + time
        plotly.offline.plot(figure, filename=stem + '.html', auto_open=False, show_link=True)
        for extension in ('.png', '.pdf'):
            plotly.io.write_image(figure, stem + extension)

    _export('rhos', z=rhos, colorscale=balance)
    _export('p_values', z=-np.log10(p_values), colorscale=dense_inv)
    _export('p_values_corr', z=-np.log10(p_values_corr), colorscale=dense_inv)
    # The pass mask is drawn with plotly's default colour scale.
    _export('passed', z=passed)
def anova_oneway_simulation(data,
                            variables,
                            effect_size,
                            sample_size,
                            alpha=0.05,
                            n_repeats=15,
                            weight_values=None,
                            weight_threshold=0.8,
                            modification_type='correlation',
                            class_balance=0.5,
                            multiple_testing_correction='fdr_by'):
    """
    Worker function to perform power calculations for a one-way ANOVA model,
    with the effect size added parametrized using Cohen's d measure.

    For every (effect size, sample size) pair the simulation is repeated
    ``n_repeats`` times: rows of ``data`` are drawn without replacement, an
    effect is added to a random subset of those rows via ``effect_cohen_d``,
    a one-way ANOVA is run per column between modified and unmodified rows,
    and the p-values are scored with ``score_confusionmetrics``.

    :param numpy.ndarray data: 2-D matrix; rows are resampled, columns are the
        variables that are tested.
    :param variables: index (or indices) of the columns to modify. Must be a
        ``float`` when ``modification_type`` is ``'proportion'``.
    :param numpy.ndarray effect_size: effect size values to test.
    :param numpy.ndarray sample_size: sample sizes (number of rows drawn) to test.
    :param float alpha: significance level forwarded to the scoring function
        and to the multiple testing correction.
    :param int n_repeats: number of Monte Carlo repeats per combination.
    :param numpy.ndarray weight_values: optional 1-D or 2-D weights; in the
        ``'correlation'`` and ``'correlation_weighted'`` modes every variable
        whose absolute weight (any column, if 2-D) reaches
        ``weight_threshold`` is modified as well.
    :param float weight_threshold: cut-off applied to ``abs(weight_values)``.
    :param str modification_type: one of ``'correlation'``, ``'manual'``,
        ``'proportion'`` or ``'correlation_weighted'``.
    :param float class_balance: fraction of the drawn rows that receive the effect.
    :param multiple_testing_correction: method name passed to ``multipletests``,
        or ``None`` to skip the corrected scoring.
    :return: ``results`` when ``multiple_testing_correction`` is ``None``,
        otherwise the tuple ``(results, adjusted_results)``. Both are dicts
        holding one ``(effect_size.size, sample_size.size, n_repeats)`` array
        per metric in ``score_metrics`` plus ``'Sample Size'`` and
        ``'Effect Size'``; ``adjusted_results`` also stores ``'method'``.
    :raises ValueError: if ``modification_type`` is not supported.
    :raises TypeError: if ``'proportion'`` is requested with a non-float ``variables``.
    """
    import warnings

    if modification_type not in [
            'correlation', 'manual', 'proportion', 'correlation_weighted'
    ]:
        raise ValueError("modification_type argument not supported")
    if modification_type == 'proportion' and not isinstance(variables, float):
        raise TypeError(
            "When using 'proportion' as modification_type 'variables' must be a float"
        )

    # Weights are only forwarded to the effect/scoring helpers in weighted mode.
    weight = weight_values if modification_type == 'correlation_weighted' else None
    result_shape = (effect_size.size, sample_size.size, n_repeats)

    # Silence warnings for the duration of the simulation only; a bare
    # warnings.filterwarnings('ignore') would mute them for the whole process.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # One result cube per metric calculated by score_confusionmetrics.
        results = {key: np.zeros(result_shape) for key in score_metrics}
        if multiple_testing_correction is not None:
            adjusted_results = {key: np.zeros(result_shape) for key in score_metrics}
            adjusted_results['method'] = multiple_testing_correction

        n_vars = data.shape[1]

        # Loop over effect size, sample size and finally each monte carlo repeat
        for eff_idx, curr_effect in np.ndenumerate(effect_size):
            for ssize_idx, curr_ssize in np.ndenumerate(sample_size):
                for rep_idx in range(n_repeats):
                    # Select a subset of the simulated spectra
                    mod_data = np.copy(data[np.random.choice(
                        data.shape[0], curr_ssize, replace=False), :])

                    # NOTE(review): as laid out here, 'proportion' mode never
                    # assigns var_to_mod / expected_hits, so the calls below
                    # fail with UnboundLocalError in that mode. The statement
                    # nesting was reconstructed from a whitespace-collapsed
                    # source - confirm the intended selection rule.
                    if modification_type != 'proportion':
                        # Flag the explicitly requested variables
                        var_to_mod = np.zeros(n_vars, dtype='int')
                        var_to_mod[variables] = 1
                        expected_hits = np.zeros(n_vars, dtype='int')
                        expected_hits[var_to_mod == 1] = 1

                        # Also modify variables above the weight threshold
                        if weight_values is not None and modification_type in [
                                "correlation", "correlation_weighted"
                        ]:
                            if weight_values.ndim == 1:
                                var_to_mod |= abs(weight_values) >= weight_threshold
                            else:
                                var_to_mod |= np.any(
                                    abs(weight_values) >= weight_threshold, axis=1)
                            expected_hits = var_to_mod

                    # Select a subset of samples to add the effect on
                    which_samples = np.random.choice(
                        range(curr_ssize),
                        int(np.floor(class_balance * curr_ssize)),
                        replace=False)

                    mod_data = effect_cohen_d(mod_data,
                                              curr_effect,
                                              which_vars=var_to_mod,
                                              which_samples=which_samples,
                                              standardized=True,
                                              noise=0,
                                              weight=weight)

                    # P-values for the one-way ANOVA (modified vs. untouched rows)
                    pvals = scistats.f_oneway(
                        np.delete(mod_data, which_samples, axis=0),
                        mod_data[which_samples, :])[1]

                    scored_res = score_confusionmetrics(
                        result_vector=pvals,
                        expected_hits=expected_hits,
                        weight_vector=weight,
                        alpha=alpha)
                    for key in scored_res.keys():
                        results[key][eff_idx, ssize_idx, rep_idx] = scored_res[key]

                    if multiple_testing_correction is not None:
                        # Use the caller's alpha: two-stage FDR methods derive
                        # their adjusted p-values from it.
                        adjusted_pvalues = multipletests(
                            pvals,
                            alpha=alpha,
                            method=multiple_testing_correction)[1]
                        # The corrected scoring is always unweighted.
                        scored_res = score_confusionmetrics(
                            result_vector=adjusted_pvalues,
                            expected_hits=expected_hits,
                            weight_vector=None,
                            alpha=alpha)
                        for key in scored_res.keys():
                            adjusted_results[key][eff_idx, ssize_idx,
                                                  rep_idx] = scored_res[key]

    results['Sample Size'] = sample_size
    results['Effect Size'] = effect_size
    if multiple_testing_correction is None:
        return results

    adjusted_results['Sample Size'] = sample_size
    adjusted_results['Effect Size'] = effect_size
    return results, adjusted_results
def save_top_manova(config,
                    attributes_types,
                    attribute_target,
                    num_top=500,
                    window=3,
                    test=MANOVATest.pillai_bartlett):
    """Rank BOPs by a sliding-window MANOVA and save the top hits.

    For every BOP, each run of ``window`` consecutive CpGs that are present in
    the loaded data is modelled with ``cpg_0 + ... ~ attributes`` and the
    p-value of ``attribute_target`` is read from the requested MANOVA
    statistic. The smallest window p-value represents the BOP. BOP p-values
    are corrected with Benjamini-Hochberg and the ``num_top`` best BOPs, their
    genes and corrected p-values are written through ``save_features``.

    :param config: project configuration object; ``approach_gd`` and ``dt``
        are modified while the gene-level file is saved, and ``dt`` is reset
        to ``DataType.cpg`` afterwards.
    :param attributes_types: iterable of ``Attribute`` / ``CellPop`` members
        used as the right-hand side of the formula.
    :param attribute_target: member whose ``value`` names the MANOVA term whose
        p-value is extracted.
    :param num_top: number of best BOPs to keep.
    :param window: number of consecutive CpGs per MANOVA model.
    :param test: ``MANOVATest`` member selecting the statistic; unknown values
        fall back to the first row of the statistics table.
    :raises ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')

    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # First-occurrence position of every CpG: gives O(1) membership tests and
    # lookups with the same result as ``cpg in cpgs`` / ``cpgs.index(cpg)``.
    cpg_position = {}
    for position, cpg in enumerate(cpgs):
        cpg_position.setdefault(cpg, position)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        atr_cols.append(atr_type.value)

    # Column names and formula are identical for every window.
    val_cols = ['cpg_' + str(cpg_id) for cpg_id in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Row of the requested statistic in the first four rows of the 'stat' table.
    stat_row = {
        MANOVATest.wilks: 0,
        MANOVATest.pillai_bartlett: 1,
        MANOVATest.lawley_hotelling: 2,
        MANOVATest.roy: 3,
    }.get(test, 0)

    num_bops = 0
    bops_passed = []
    bops_pvals = []
    for bop in dict_bop_cpgs:
        cpgs_passed = [cpg for cpg in dict_bop_cpgs.get(bop) if cpg in cpg_position]

        # The window count follows ``window`` (it used to be hard-coded for 3,
        # which skipped or overran windows for any other size).
        if len(cpgs_passed) >= window:
            pvals_on_bop = []
            for win_id in range(len(cpgs_passed) - window + 1):
                val_table = [
                    betas[cpg_position[cpg]]
                    for cpg in cpgs_passed[win_id:win_id + window]
                ]
                # Transpose the list of columns into a list of rows.
                table = list(map(list, zip(*(atr_table + val_table))))
                x = pd.DataFrame(table, columns=cols)

                manova = MANOVA.from_formula(formula, x)
                mv_test_res = manova.mv_test()
                # Column 4 of the first four rows is read as the p-value.
                pvals = mv_test_res.results[attribute_target.value]['stat'].values[0:4, 4]
                pvals_on_bop.append(pvals[stat_row])

            bops_passed.append(bop)
            bops_pvals.append(np.min(pvals_on_bop))

        # NOTE(review): progress counter placement (per BOP, not per tested
        # BOP) was reconstructed from a whitespace-collapsed source.
        num_bops += 1
        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    _, pvals_corrected, _, _ = multipletests(bops_pvals, 0.05, method='fdr_bh')
    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop)
        genes_opt.append(';'.join(curr_genes))
        # Keep genes unique while preserving first-seen order.
        for gene in curr_genes:
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # Temporarily switch the config to gene level so that the result path
    # resolves to the gene-level location.
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    fn = get_result_path(config, 'top.txt')
    save_features(fn, [genes_from_bop])
    config.dt = DataType.cpg
def Enrichment_Analyses_GO_terms(Name_Network,
                                 save_directory,
                                 Annotation_Directory,
                                 Original_Network_Name,
                                 enrichment="NO",
                                 MaxSize=500,
                                 MinSize=5,
                                 Repetitions=10,
                                 total_K=100,
                                 Rand_K_Compare=66,
                                 comparison="Bin_VS_Prob"):
    """Run GO-term enrichment over k-medoids clusterings of a network.

    For each repetition and each cluster count ``option`` in
    ``range(1, total_K, 5)`` the genes are clustered with ``Cluster_Option``,
    every GO term present in a cluster is tested with a hypergeometric test,
    the p-values of each cluster are corrected with Benjamini-Hochberg and
    terms with a corrected p-value below 0.05 count as enriched. Summary
    tables are written as text files whose names start with ``save_directory``.

    Inputs read from disk:
      * ``Annotation_Directory``: tab-separated file; its ``Level`` column is
        dropped and the remaining two columns are used as Gene / GO_Term.
      * ``save_directory + "_Gene_Names_" + Original_Network_Name``:
        comma-separated, first row skipped, gene names taken from column 1.
      * ``save_directory + "_Result_Tijana_Final_" + Name_Network``:
        space-separated distance table, indexed by its column ``'1'``.

    :param Name_Network: suffix of the distance file and of every output file.
    :param save_directory: path prefix for input and output files.
    :param Annotation_Directory: path of the GO annotation file.
    :param Original_Network_Name: suffix of the gene-names file.
    :param enrichment: label embedded in the output file names.
    :param MaxSize: GO terms must annotate fewer than this many network genes.
    :param MinSize: GO terms must annotate more than this many network genes.
    :param Repetitions: number of times the whole analysis is repeated.
    :param total_K: exclusive upper bound of the cluster counts that are tried.
    :param Rand_K_Compare: not used by this function.
    :param comparison: not used by this function.
    """
    ## Preparation of the Network to Enrich ##

    # All the information of the data base:
    GO_Complete_Experimental = pd.read_csv(Annotation_Directory, sep="\t", header=0)
    GO_Complete_Experimental.drop("Level", inplace=True, axis=1)
    GO_Complete_Experimental.columns = ["Gene", "GO_Term"]

    # The genes of our Network:
    genes_Network = pd.read_csv(save_directory + "_Gene_Names_" +
                                str(Original_Network_Name),
                                sep=",",
                                header=None,
                                skiprows=[0])
    genes_Network = pd.DataFrame({"Gene": genes_Network[1]})

    # Annotation of the genes of our Network:
    GO_Join_Network = GO_Complete_Experimental[
        GO_Complete_Experimental.Gene.isin(genes_Network.Gene)]

    # Filters (both bounds are strict):
    Cut_By = pd.DataFrame(
        GO_Join_Network.groupby('GO_Term')['Gene'].nunique(dropna=True))
    Cut_By = Cut_By[Cut_By.Gene > MinSize]
    Cut_By = Cut_By[Cut_By.Gene < MaxSize]
    GO_Join_Network = GO_Join_Network[GO_Join_Network.GO_Term.isin(Cut_By.index)]

    ## Distances Preparation for clustering ##
    Distance = pd.read_csv(
        save_directory + "_Result_Tijana_Final_" + Name_Network,
        sep=" ",
        header=0,
    )
    Distance.set_index('1', inplace=True)

    ## Variables to store the results ##
    Results_GO_Enrichment_Final = pd.DataFrame(columns=["K_Option", "Terms_Enriched"])
    Results_Cluster_Enrichment_Final = pd.DataFrame(
        columns=["K_Option", "Cluster_Enriched"])
    Results_Genes_Percent_Final = pd.DataFrame(columns=["K_Option", "Total_enriched"])
    # Results for Rand_Index:
    Results_Rand_Index = pd.DataFrame(columns=["Option", "Cluster", "Terms"])
    # Results for Gene:
    Results_Gene_GO_final = pd.DataFrame(
        columns=["Gene", "GO_Term", "Cluster", "Option", "Repetition"])

    # NOTE: DataFrame.append was removed in pandas 2.0; every accumulation
    # below uses pd.concat([accumulated, new]), its direct equivalent.

    ## Enrichment Analyses ##
    for Statistics in range(Repetitions):
        print("Repetition number", Statistics)

        # Per each repetition the accumulators are reset:
        Results_Enrichment = pd.DataFrame(columns=[
            "Option", "Enriched_GO", "Cluster", "Num_Genes_Annotated", "Term"
        ])
        Results_Gene_GO = pd.DataFrame(
            columns=["Gene", "GO_Term", "Cluster", "Option", "Repetition"])

        for option in range(1, total_K, 5):
            # First we start with the computing of the clusters
            print("K number", option)
            clusters = Cluster_Option(option,
                                      genes_Network,
                                      Distance,
                                      method_clust="Kmedoids")

            clusters_Data_frame = pd.DataFrame(columns=["Gene", "Cluster", "Option"])
            for i in range(len(clusters)):
                gene_selection = genes_Network.iloc[clusters[i]]
                Iterator_DB = pd.DataFrame({
                    'Gene': gene_selection["Gene"],
                    'Cluster': np.repeat(i + 1, len(gene_selection)),
                    "Option": np.repeat(option, len(gene_selection))
                })
                clusters_Data_frame = pd.concat([clusters_Data_frame, Iterator_DB])

            # Number of genes annotated and how many of them are in each category:
            Total_Annotated_Genes = GO_Join_Network["Gene"].nunique()
            Number_Genes_per_GO = GO_Join_Network.groupby('GO_Term')['Gene'].nunique(
                dropna=True)

            # For each cluster (clusters are numbered from 1):
            for cluster in range(option):
                selection_cluster = clusters_Data_frame[
                    clusters_Data_frame.Cluster == (cluster + 1)]

                # Annotation of genes in the cluster with GO:
                GO_Selected = GO_Join_Network[GO_Join_Network.Gene.isin(
                    selection_cluster.Gene)]

                # Put inside of the external variable to keep info:
                Genes_itera = pd.DataFrame({
                    "Gene": GO_Selected.Gene,
                    "GO_Term": GO_Selected.GO_Term,
                    'Cluster': cluster + 1,
                    "Option": option,
                    "Repetition": Statistics
                })
                Results_Gene_GO = pd.concat([Results_Gene_GO, Genes_itera])

                # Number of genes with a concrete GO term in the cluster, next
                # to the network-wide count of the same term:
                k_selection = GO_Selected.groupby('GO_Term')['Gene'].nunique()
                K_and_k_data_frame = pd.merge(Number_Genes_per_GO,
                                              k_selection,
                                              on='GO_Term',
                                              how='right')

                # Total genes with annotation in the cluster:
                Total_Annotated_Cluster = GO_Selected["Gene"].nunique()

                # For the results of the enrichment in the cluster:
                Results_Enrichment_Cluster = pd.DataFrame(
                    columns=["GO_Term", "p_value", 'Cluster', 'Option'])

                # For each GO term in the cluster:
                for GO_term in range(len(K_and_k_data_frame)):
                    Enrich_GO = K_and_k_data_frame.iloc[GO_term]
                    M = Total_Annotated_Genes
                    k = Enrich_GO.iloc[0]  # genes with the term in the network
                    N = Total_Annotated_Cluster
                    X = Enrich_GO.iloc[1]  # genes with the term in the cluster
                    # sf(X - 1) = P(at least X annotated genes in the cluster)
                    p_value = hypergeom.sf(X - 1, M, k, N)

                    results_db_iter = pd.DataFrame({
                        'GO_Term': K_and_k_data_frame.index[GO_term],
                        'p_value': [p_value],
                        'Cluster': [cluster + 1],
                        'Option': [option]
                    })
                    Results_Enrichment_Cluster = pd.concat(
                        [Results_Enrichment_Cluster, results_db_iter])

                if len(K_and_k_data_frame) != 0:
                    # Only the adjusted p-values are used, so the alpha given
                    # here does not affect the 0.05 cut-off applied below.
                    p_value_Correction = multipletests(
                        Results_Enrichment_Cluster["p_value"],
                        alpha=0.01,
                        method='fdr_bh',
                        is_sorted=False,
                        returnsorted=False)
                    Results_Enrichment_Cluster['p_value_Correction'] = p_value_Correction[1]

                    count_GO_Enriched = sum(
                        Results_Enrichment_Cluster['p_value_Correction'] < 0.05)
                    names_GO_Enriched = Results_Enrichment_Cluster[
                        Results_Enrichment_Cluster.p_value_Correction < 0.05]
                    names_GO_Enriched = list(names_GO_Enriched["GO_Term"])

                    Results_itera_Enrich = pd.DataFrame({
                        'Option': [option],
                        "Enriched_GO": [count_GO_Enriched],
                        'Cluster': [cluster + 1],
                        "Num_Genes_Annotated": [Total_Annotated_Cluster],
                        "Term": [names_GO_Enriched]
                    })
                else:
                    # Cluster without annotated genes: record an empty result.
                    Results_itera_Enrich = pd.DataFrame({
                        'Option': [option],
                        "Enriched_GO": [0],
                        'Cluster': [cluster + 1],
                        "Num_Genes_Annotated": [Total_Annotated_Cluster],
                        "Term": [[]]
                    })
                Results_Enrichment = pd.concat(
                    [Results_Enrichment, Results_itera_Enrich])

        # Calculations per each repetition:
        GO_percent = Function_GO_enriched(Results_Enrichment,
                                          GO_Join_Network["GO_Term"].nunique(),
                                          total_K)
        Cluster_percent = Function_Calculate_Clust_Perc(Results_Enrichment, total_K)
        Genes_percent = Function_Gene_enriched(Results_Gene_GO, Results_Enrichment,
                                               total_K, genes_Network)

        Results_GO_Enrichment_Final = pd.concat(
            [Results_GO_Enrichment_Final, GO_percent])
        Results_Cluster_Enrichment_Final = pd.concat(
            [Results_Cluster_Enrichment_Final, Cluster_percent])
        Results_Genes_Percent_Final = pd.concat(
            [Results_Genes_Percent_Final, Genes_percent])

        # For Rand:
        Rand = pd.DataFrame({
            "Option": Results_Enrichment.Option,
            "Cluster": Results_Enrichment.Cluster,
            "Terms": Results_Enrichment.Term
        })
        Results_Rand_Index = pd.concat([Results_Rand_Index, Rand])

        # For Cluster.
        Results_Gene_GO_final = pd.concat([Results_Gene_GO_final, Results_Gene_GO])

    # Drop the placeholder columns the accumulators were created with.
    Results_GO_Enrichment_Final.drop("K_Option", inplace=True, axis=1)
    Results_GO_Enrichment_Final.drop("Terms_Enriched", inplace=True, axis=1)
    Results_Cluster_Enrichment_Final.drop("Cluster_Enriched", inplace=True, axis=1)
    Results_Cluster_Enrichment_Final.drop("K_Option", inplace=True, axis=1)

    # Save files just in case:
    Results_GO_Enrichment_Final.to_csv(save_directory + "_Enrichment_GO" + "_" +
                                       enrichment + "_" + Name_Network + ".txt",
                                       header=True,
                                       index=False)
    Results_Cluster_Enrichment_Final.to_csv(
        save_directory + "_Enrichment_Cluster" + "_" + enrichment + "_" +
        Name_Network + ".txt",
        header=True,
        index=False)
    Results_Genes_Percent_Final.to_csv(save_directory + "_Enrichment_Genes" + "_" +
                                       enrichment + "_" + Name_Network + ".txt",
                                       header=True,
                                       index=False)

    # Save info about Genes and Clusters:
    Results_Gene_GO_final.to_csv(save_directory + "_Cluster_Information_" + "_" +
                                 enrichment + "_" + Name_Network + ".txt",
                                 header=True,
                                 index=False)

    # Save Rand:
    Results_Rand_Index.to_csv(save_directory + "_Rand_Information_" + "_" +
                              enrichment + "_" + Name_Network + ".txt",
                              header=True,
                              index=False)
def pfam_hyg(pfam): k = gi1_pfams.count(pfam) M = len(full_pfams) N = len(gi1_pfams) n = full_pfams.count(pfam) p = hypergeom.sf(k=k, M=M, n=n, N=N) ratio = (float(k) / N) / (float(n) / M) return p, ratio pfams_pvals = {p: pfam_hyg(p)[0] for p in all_pfams} pfams_effect = {p: pfam_hyg(p)[1] for p in all_pfams} adj_pval = dict( zip(all_pfams, multi.multipletests(list(pfams_pvals.values()), method="fdr_bh")[1])) sigs = { k.split(".")[0]: { 'name': pfam2name[k.split(".")[0]], 'pval': pfams_pvals[k], 'adj.pval': v, 'ratio': pfams_effect[k] } for k, v in sorted(adj_pval.items(), key=lambda x: x[1]) if v < 0.05 } pfam_table = DataFrame.from_dict(sigs, orient='index') pfam_table = pfam_table.sort_values(by='adj.pval') pfam_table['group'] = [ "other transferase" if "ransferase" in p else "" for p in pfam_table.name ] pfam_table['group'] = [
def main(_):
    """Run the rater-agreement / PPCA analysis and write all plots.

    Steps, in order:
      1. load the annotations from ``FLAGS.data`` and the target names from
         ``FLAGS.target_file``; drop neutral rows, rows without any rating and
         examples with fewer than three annotations;
      2. build an examples x raters x targets rating tensor plus a rater mask;
      3. compute leave-one-rater-out (partial) correlations with ``LeaveOut``
         and test each dimension with a Wilcoxon signed-rank test,
         Bonferroni-corrected;
      4. split the raters of every example randomly in two halves, run
         ``PPCA`` on the half means and apply a varimax rotation;
      5. write the component-loading heatmap, the label frequency/agreement
         bar plot (+ csv) and an interactive t-SNE chart to ``FLAGS.plot_dir``.

    :param _: unused positional argument (the value handed over by the app runner).
    """
    print("Loading data...")
    data = pd.read_csv(FLAGS.data, encoding="utf-8")
    print("%d Examples" % (len(set(data["id"]))))
    print("%d Annotations" % len(data))
    os.makedirs(FLAGS.plot_dir, exist_ok=True)

    with open(FLAGS.target_file, "r") as f:
        all_targets = f.read().splitlines()
    all_targets_neutral = all_targets + ["neutral"]
    target2idx = {e: i for i, e in enumerate(all_targets)}
    print("%d Target Categories" % len(all_targets))

    print("Processing data...")
    # Remove neutral labels
    data = data[data["neutral"] == 0]
    # Remove examples with no ratings (difficult examples)
    data = data[data[all_targets_neutral].sum(axis=1) != 0]
    # Convert into num_examples x num_raters x num_ratings format
    data = data.groupby("id").filter(lambda x: len(x) >= 3)
    id_groups = data.groupby("id")
    worker2examples = {}  # dict mapping worker ids to (example, rater id) tuples
    max_num_raters = data.groupby("id").size().max()
    ratings = np.zeros(
        (len(id_groups), max_num_raters, len(all_targets)))  # ignore "neutral"
    rater_msk = np.zeros(
        (len(id_groups), max_num_raters))  # for masking out non-existent raters
    print("Ratings shape", ratings.shape)

    # Get ratings and rater mask
    texts = []
    for ex_idx, (_, g) in enumerate(id_groups):
        texts.append(g.iloc[0]["text"])
        rater_count = 0
        # iterate through workers
        for _, row in g.iterrows():
            for e in all_targets:
                ratings[ex_idx, rater_count, target2idx[e]] = row[e]
                rater_msk[ex_idx, rater_count] = 1
            worker_id = row["rater_id"]
            worker2examples.setdefault(worker_id, []).append((ex_idx, rater_count))
            rater_count += 1

    print("Calculating leave-out (partial) correlations...")
    partial_corr_per_rater = []
    corr_per_rater = []
    for worker_id in worker2examples:
        partial_corrs, corrs = LeaveOut(ratings, rater_msk, worker2examples,
                                        worker_id)
        # Skip raters for which not every target produced a correlation.
        if len(partial_corrs) < len(all_targets):
            continue
        partial_corr_per_rater.append(partial_corrs)
        corr_per_rater.append(corrs)
    corr_per_rater = np.array(corr_per_rater)
    partial_corr_per_rater = np.array(partial_corr_per_rater)

    # Verify that there are no NaN values
    assert np.isnan(corr_per_rater).sum() == 0

    # Apply Wilcoxon signed rank test to test significance of each dimension
    # (row 1 of the stacked results holds the p-values)
    p_vals = np.apply_along_axis(wilcoxon, 0, partial_corr_per_rater)[1]
    # Apply Bonferroni correction
    reject, corr_pvals, _, newalpha = multipletests(
        p_vals, alpha=0.05, method="bonferroni")
    print("Which dimensions to keep?")
    print(reject)
    print(corr_pvals)
    print(newalpha)

    print("Running PPCA on all the data...")
    # Take all raters and split them randomly
    x = []
    y = []
    rater_counts = rater_msk.sum(axis=1).astype(int)
    all_ratings_avg = []
    for i, ex in enumerate(ratings):
        # Get actual raters based on mask
        keep = [list(worker_rating) for worker_rating in ex[:rater_counts[i]]]
        all_ratings_avg.append(list(np.array(keep).mean(axis=0)))

        # Shuffle raters randomly
        random.shuffle(keep)
        num_raters = len(keep)
        x.append(list(np.array(keep[:int(num_raters / 2)]).mean(axis=0)))
        y.append(list(np.array(keep[int(num_raters / 2):]).mean(axis=0)))
    x = np.array(x)
    y = np.array(y)
    all_ratings_avg = np.array(all_ratings_avg)
    w, v = PPCA(x, y)  # final components (p-values determine which ones to keep)

    print("Plotting percentage of covariance explained...")
    PlotCovar(v)

    # Apply varimax rotation
    w_vari = Varimax(w)

    # Get mapping between ppcs and targets
    map_df = pd.DataFrame(
        w_vari, index=all_targets, columns=np.arange(len(all_targets))).round(4)
    # Sort to move values to diagonal.
    # pd.Series.nonzero was removed in pandas 1.0; the NumPy call on the
    # underlying values returns the same index array.
    map_df = map_df[list(
        np.argsort(map_df.apply(lambda col: col.values.nonzero()[0]).values)[0])]
    plt.figure(figsize=(10, 6), dpi=300)
    sns.heatmap(
        map_df,
        center=0,
        cmap=sns.diverging_palette(240, 10, n=50),
        yticklabels=all_targets)
    plt.xlabel("Component")
    plt.savefig(
        FLAGS.plot_dir + "/component_loadings.pdf",
        dpi=600,
        format="pdf",
        bbox_inches="tight")
    ppc2target = map_df.abs().idxmax().to_dict()
    target2ppc = {e: i for i, e in ppc2target.items()}
    print(ppc2target)

    print("Plotting frequency and mean left-out rater correlations...")
    corr_mean = corr_per_rater.mean(axis=0)
    corr_mean_ordered = [corr_mean[target2ppc[e]] for e in all_targets]
    df_plot = pd.DataFrame({
        "target": all_targets,
        "agreement": corr_mean_ordered
    })
    df_plot["count"] = df_plot["target"].map(
        data[all_targets].sum(axis=0).to_dict())
    df_plot.sort_values("count", ascending=False, inplace=True)
    df_plot.to_csv(FLAGS.plot_dir + "/target_agreements.csv", index=False)

    # Get colors
    norm = plt.Normalize(df_plot["agreement"].min(), df_plot["agreement"].max())
    sm = plt.cm.ScalarMappable(cmap="BuPu", norm=norm)
    sm.set_array([])

    # Generate figure
    fig = plt.figure(dpi=600, figsize=(5, 6))
    ax = sns.barplot(
        data=df_plot,
        y="target",
        x="count",
        orient="h",
        hue="agreement",
        palette="BuPu",
        dodge=False,
        edgecolor="black",
        linewidth=1)
    ax.get_legend().remove()
    ax.figure.colorbar(sm)
    plt.text(18000, 31, "Interrater\nCorrelation", ha="center")
    plt.xlabel("Number of Examples")
    plt.ylabel("")
    plt.draw()
    labels = [item.get_text() for item in ax.get_xticklabels()]
    ax.set_xticklabels(["%dk" % (int(int(label) / 1000)) for label in labels])
    plt.tight_layout()
    fig.savefig(
        FLAGS.plot_dir + "/label_distr_agreement.pdf",
        dpi=600,
        format="pdf",
        bbox_inches="tight")

    print("Generating t-SNE plot...")
    # Get PPC scores for all examples
    all_ratings_avg = Demean(all_ratings_avg)  # demean all ratings
    ppc_scores = all_ratings_avg.dot(w_vari)  # project onto ppcs
    ppc_scores_abs = np.absolute(ppc_scores)

    # Load maximally distinct colors
    colors = pd.read_csv(
        FLAGS.rgb_colors, sep="\t", header=None, names=np.arange(3))

    # Set colors (todo(ddemszky): add names to colors in file)
    palette_rgb = colors.values
    with open(FLAGS.target_color_order) as f:
        color_order = f.read().splitlines()
    ppc2color = {target2ppc[e]: i for i, e in enumerate(color_order)}

    # get rgb value for each example based on weighted average of top targets
    rgb_vals = []
    hex_vals = []
    top_categories = []
    threshold = 0.5  # exclude points not loading on any of the top 10 categories
    rgb_max = 255
    other_color = palette_rgb[len(all_targets), :]
    for i, scores in enumerate(ppc_scores_abs):
        # Up to two strongest components, kept only if above the threshold.
        top_ppcs = [
            idx for idx in (-scores).argsort()[:2] if scores[idx] > threshold
        ]
        top_targets = ",".join([ppc2target[idx] for idx in top_ppcs
                               ]) if top_ppcs else "other"
        top_categories.append(top_targets)
        if len(top_ppcs) < 1:  # doesn't have top targets from list
            color = other_color  # use grey
        else:
            # Weighted average of top targets (square->weighted average->square root)
            color_ids = [ppc2color[idx] for idx in top_ppcs]
            weights = [scores[idx] for idx in top_ppcs]
            # Need to round, otherwise floating point precision issues will result
            # in values slightly above 1
            avg = np.round(
                np.sqrt(
                    np.average(
                        np.power(palette_rgb[color_ids] * rgb_max, 2),
                        axis=0,
                        weights=weights)) / rgb_max, 4)
            if (avg > 1).sum() > 0:
                print(avg)
            color = avg
        rgb_vals.append(list(color))
        hex_vals.append("#%02x%02x%02x" %
                        tuple(np.array(color * rgb_max, dtype=int)))
    rgb_vals = np.array(rgb_vals)

    # Create t-SNE model
    tsne_model = TSNE(
        perplexity=30,
        n_components=2,
        n_iter=1000,
        random_state=23,
        learning_rate=500,
        init="pca")
    new_values = tsne_model.fit_transform(ppc_scores)
    x = [value[0] for value in new_values]
    y = [value[1] for value in new_values]

    # Put data in dataframe
    df = pd.DataFrame({
        "x": x,
        "y": y,
        "color": hex_vals,
        "label(s)": top_categories,
        "text": texts
    })
    df = df[df["label(s)"] != "other"]
    df["top_label"] = df["label(s)"].str.split(",").str[0]

    # Two selections:
    # - a brush that is active on the top panel
    # - a multi-click that is active on the bottom panel
    brush = alt.selection(type="interval")
    click = alt.selection_multi(encodings=["color"])

    # max 5000 examples can be plotted; cap at the number of available rows
    # because DataFrame.sample raises when asked for more than it has.
    sample = df.sample(min(5000, len(df)))

    points = alt.Chart(sample).mark_point(
        filled=True, size=50).encode(
            x="x:Q",
            y="y:Q",
            color=alt.Color("color", scale=None),
            tooltip=["label(s)", "text"]).properties(
                width=700, height=600).add_selection(brush)

    # Bottom panel is a bar chart
    bars = alt.Chart(sample).mark_bar().encode(
        x="count()",
        y="top_label:N",
        color=alt.condition(click, alt.Color("color:N", scale=None),
                            alt.value("lightgray")),
    ).transform_filter(brush.ref()).properties(
        width=700, selection=click)

    chart = alt.vconcat(
        points, bars, data=sample, title="t-SNE Projection of Examples")
    chart.save(FLAGS.plot_dir + "/tsne.html", format="html")
def get_score_df(self, correction_method=None):
    '''
    Compute Cohen's d and Hedges' r (with standard errors, z-scores and
    one-sided p-values) for every term, comparing the category rows against
    the non-category rows of the row-normalised term matrix.

    :param correction_method: str or None, correction method from
        statsmodels.stats.multitest.multipletests. 'fdr_bh' is recommended.
        When given, ``cohens_d_p_corr`` and ``hedges_r_p_corr`` columns are
        added; terms whose mean is zero in both groups keep the value 0.5.
    :return: pd.DataFrame indexed by term with the columns cohens_d,
        cohens_d_se, cohens_d_z, cohens_d_p, hedges_r, hedges_r_se,
        hedges_r_z, hedges_r_p, m1, m2 (NaN replaced by 0).
    '''
    # From https://people.kth.se/~lang/Effect_size.pdf
    # Shinichi Nakagawa1 and Innes C. Cuthill. Effect size, confidence interval and statistical
    # significance: a practical guide for biologists. 2007. In Biological Reviews 82.
    #
    # Modification: when calculating variance, an empty document is added to each set
    X = self._get_X().astype(np.float64)
    X = X / X.sum(axis=1)  # each row becomes a distribution over terms
    X[np.isnan(X)] = 0  # rows that summed to zero
    cat_X, ncat_X = self._get_cat_and_ncat(X)
    empty_cat_X_smoothing_doc = np.zeros((1, cat_X.shape[1]))
    empty_ncat_X_smoothing_doc = np.zeros((1, ncat_X.shape[1]))
    smoothed_cat_X = np.vstack([empty_cat_X_smoothing_doc, cat_X])
    smoothed_ncat_X = np.vstack([empty_ncat_X_smoothing_doc, ncat_X])
    # Group sizes are the number of rows (documents, including the smoothing
    # document) that the variances below are taken over. shape[1] is the
    # vocabulary size, which is the same for both groups and unrelated to
    # the sample size.
    n1, n2 = float(smoothed_cat_X.shape[0]), float(smoothed_ncat_X.shape[0])
    n = n1 + n2
    m1 = cat_X.mean(axis=0).A1
    m2 = ncat_X.mean(axis=0).A1
    v1 = smoothed_cat_X.var(axis=0).A1
    v2 = smoothed_ncat_X.var(axis=0).A1

    s_pooled = np.sqrt(((n2 - 1) * v2 + (n1 - 1) * v1) / (n - 2.))
    cohens_d = (m1 - m2) / s_pooled
    cohens_d_se = np.sqrt(
        ((n - 1.) / (n - 3)) * (4. / n) * (1 + np.square(cohens_d) / 8.))
    cohens_d_z = cohens_d / cohens_d_se
    cohens_d_p = norm.sf(cohens_d_z)

    hedges_r = cohens_d * (1 - 3. / ((4. * (n - 2)) - 1))
    hedges_r_se = np.sqrt(n / (n1 * n2) + np.square(hedges_r) / (n - 2.))
    hedges_r_z = hedges_r / hedges_r_se
    hedges_r_p = norm.sf(hedges_r_z)

    score_df = pd.DataFrame(
        {
            'cohens_d': cohens_d,
            'cohens_d_se': cohens_d_se,
            'cohens_d_z': cohens_d_z,
            'cohens_d_p': cohens_d_p,
            'hedges_r': hedges_r,
            'hedges_r_se': hedges_r_se,
            'hedges_r_z': hedges_r_z,
            'hedges_r_p': hedges_r_p,
            'm1': m1,
            'm2': m2,
        },
        index=self.corpus_.get_terms()).fillna(0)

    if correction_method is not None:
        from statsmodels.stats.multitest import multipletests
        # Only terms that occur in at least one group are corrected.
        occurs = (score_df['m1'] != 0) | (score_df['m2'] != 0)
        # Created first so the output column order stays
        # (..., hedges_r_p_corr, cohens_d_p_corr).
        score_df['hedges_r_p_corr'] = 0.5
        for method in ['cohens_d', 'hedges_r']:
            score_df[method + '_p_corr'] = 0.5
            score_df.loc[occurs, method + '_p_corr'] = multipletests(
                score_df.loc[occurs, method + '_p'],
                method=correction_method)[1]

    return score_df
def parse_IPMASS(t=None, mode='log2'):
    """Build the P-vs-N enrichment table for one IP-mass time point.

    Reads ``<t>_P_N.xlsx`` from the hard-coded IP-mass data directory,
    derives per-group sums/means and a per-row two-sample t-test between the
    P and N replicate intensities, writes ``<t>_enrich_table.xlsx`` and a
    scatter plot ``<t>_enrich_pvalue.pdf`` to the same directory.

    :param t: time-point label used in file and column names; defaults to 'T1'.
    :param mode: ``'log2'`` transforms every replicate with log2 first (zero
        intensities are stored as 0.001) and works on the transformed values;
        any other value works on raw intensities and adds log2 of the means.
    :return: tuple ``(df, rep_N_log2_ls, rep_P_log2_ls, table)`` where ``df``
        is the result frame including a ``'Gene Names'`` column, the two lists
        are the log2 replicate column names and ``table`` is ``df`` restricted
        to the kept and calculated columns.
    """
    if t is None:
        t = 'T1'
    f = '/Share2/home/zhangqf7/gongjing/zebrafish/script/zebrafish_structure/data/IP-mass/%s_P_N.xlsx' % (
        t)
    df = pd.read_excel(f)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print("read: %s, num=%s" % (f, df.shape[0]))

    # Only replicates 1 and 2 of each group are used.
    rep_N = ['LFQ intensity %s_N1' % (t), 'LFQ intensity %s_N2' % (t)]
    rep_P = ['LFQ intensity %s_P1' % (t), 'LFQ intensity %s_P2' % (t)]
    cols_keep = ['Gene names'] + rep_N + rep_P
    rep_N_log2_ls = ['log2(%s)' % (i) for i in rep_N]
    rep_P_log2_ls = ['log2(%s)' % (i) for i in rep_P]

    # Means divide by the number of replicates actually summed (the divisor
    # was a hard-coded 4.0 left over from a four-replicate layout).
    n_rep_N = float(len(rep_N))
    n_rep_P = float(len(rep_P))

    if mode == 'log2':
        # log2 first
        for i in rep_N + rep_P:
            df['log2(%s)' % (i)] = [
                0.001 if float(v) == 0 else np.log2(float(v)) for v in df[i]
            ]
        test_cols_N, test_cols_P = rep_N_log2_ls, rep_P_log2_ls
        df['sum(N)'] = df.loc[:, rep_N_log2_ls].sum(axis=1)
        df['sum(P)'] = df.loc[:, rep_P_log2_ls].sum(axis=1)
        df['mean(N)'] = df['sum(N)'] / n_rep_N
        df['mean(P)'] = df['sum(P)'] / n_rep_P
        df['sum(P)-sum(N)'] = df['sum(P)'] - df['sum(N)']
        df['mean(P)-mean(N)'] = df['mean(P)'] - df['mean(N)']
    else:
        test_cols_N, test_cols_P = rep_N, rep_P
        df['sum(N)'] = df.loc[:, rep_N].sum(axis=1)
        df['sum(P)'] = df.loc[:, rep_P].sum(axis=1)
        df['mean(N)'] = df['sum(N)'] / n_rep_N
        df['mean(P)'] = df['sum(P)'] / n_rep_P
        # Replace zero means by 1 so that their log2 is 0 instead of -inf.
        df['mean(N)'] = [1 if i == 0 else i for i in df['mean(N)']]
        df['mean(P)'] = [1 if i == 0 else i for i in df['mean(P)']]
        df['log2(mean(N))'] = np.log2(df['mean(N)'])
        df['log2(mean(P))'] = np.log2(df['mean(P)'])
        df['sum(P)-sum(N)'] = df['sum(P)'] - df['sum(N)']
        df['mean(P)-mean(N)'] = df['mean(P)'] - df['mean(N)']
        df['log2(mean(P)/mean(N))'] = df['log2(mean(P))'] - df['log2(mean(N))']

    # Per-row t-test between the P and N replicates; undefined results count
    # as not significant.
    pvalue = []
    for index, row in df.iterrows():
        s, p = stats.ttest_ind([row[i] for i in test_cols_P],
                               [row[i] for i in test_cols_N])
        if np.isnan(p):
            p = 1
        pvalue.append(p)

    # multipletests is called with its default correction method.
    qvalue = multi.multipletests(pvalue)
    df['pvalue'] = pvalue
    df['qvalue'] = qvalue[1]
    df['-log10(qvalue)'] = -np.log10(df['qvalue'])
    df['-log10(pvalue)'] = -np.log10(df['pvalue'])

    cols_calc = [
        'sum(N)', 'sum(P)', 'mean(N)', 'mean(P)', 'log2(mean(N))',
        'log2(mean(P))', 'sum(P)-sum(N)', 'mean(P)-mean(N)',
        'log2(mean(P)/mean(N))', 'pvalue', 'qvalue', '-log10(pvalue)',
        '-log10(qvalue)'
    ]
    # 'log2' mode never creates the three log2(mean...) columns; selecting
    # them would raise KeyError, so keep only the columns that exist.
    cols_calc = [c for c in cols_calc if c in df.columns]
    df = df[cols_keep + cols_calc].copy()
    df.to_excel(
        '/Share2/home/zhangqf7/gongjing/zebrafish/script/zebrafish_structure/data/IP-mass/%s_enrich_table.xlsx'
        % (t),
        header=True,
        index=False)

    fig, ax = plt.subplots(figsize=(6, 6))
    # In 'log2' mode the group means are already on the log2 scale.
    x_col = 'log2(mean(N))' if 'log2(mean(N))' in df.columns else 'mean(N)'
    y_col = 'log2(mean(P))' if 'log2(mean(P))' in df.columns else 'mean(P)'
    df.plot(kind='scatter', x=x_col, y=y_col, ax=ax)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.title('time: %s (n=%s)' % (t, df.shape[0]))
    savefn = '/Share2/home/zhangqf7/gongjing/zebrafish/script/zebrafish_structure/data/IP-mass/%s_enrich_pvalue.pdf' % (
        t)

    # First name of every ';'-separated gene-name entry.
    df['Gene Names'] = [i.split(';')[0] for i in df['Gene names']]
    texts = []
    for x, y, gene in zip(df[x_col], df[y_col], df['Gene Names']):
        # Only elavl1 is labelled, and only above the -log10(0.05) line.
        if y > -np.log10(0.05) and gene == 'elavl1':
            texts.append(plt.text(x, y, gene, fontsize=12))
    adjust_text(texts, only_move={'text': 'x'})
    plt.tight_layout()
    plt.savefig(savefn)
    plt.close()

    return df, rep_N_log2_ls, rep_P_log2_ls, df[cols_keep + cols_calc]
def build_graph(pairs_occurances,filtered_clustering_table,alpha,method,verb=False):
    """Build a weighted graph whose edges are the significant (V1, V2) pairs.

    Parameters:
        pairs_occurances (pd.DataFrame): table with columns ``V1``, ``V2`` and
            ``pvalue``. It is modified in place: a ``pair`` column is always
            added, and ``pvalue_correct`` / ``reject`` are added when a
            correction method is used.
        filtered_clustering_table (pd.DataFrame): table with columns ``seed``,
            ``well``, ``type``, ``seq``, ``clusterSize`` and ``domain`` used
            to annotate the nodes.
        alpha (float): significance threshold.
        method (str): ``'pvalue'`` keeps pairs whose raw p-value is below
            ``alpha``; any other value is forwarded to ``multipletests`` as
            the correction method and the rejected hypotheses are kept.
        verb (bool): emit an extra log line before annotating the graph.

    Returns:
        G (nx.Graph): graph whose edge weights are ``-log10`` of the raw or
            corrected p-value, stored as strings.
    """
    G = nx.Graph()

    # Materialise the pairs: a bare zip object is a one-shot iterator without
    # a length under Python 3 and is not a valid column value.
    pairs_occurances['pair'] = list(zip(pairs_occurances.V1.values, pairs_occurances.V2.values))

    if method == 'pvalue':
        selected = pairs_occurances[pairs_occurances['pvalue'] < alpha]
        weight_column = 'pvalue'
    else:
        (reject, pvals_correct, _sidak, _bonf) = multipletests(pairs_occurances.pvalue.values, alpha, method)
        pairs_occurances['pvalue_correct'] = pvals_correct
        pairs_occurances['reject'] = reject
        selected = pairs_occurances[pairs_occurances['reject']]
        weight_column = 'pvalue_correct'

    # The 'pair' column always holds real tuples (built just above), so the
    # former ast.literal_eval attempt could never succeed and always fell
    # through to using the values directly.
    edges = selected['pair'].values
    risks = selected[weight_column].apply(lambda x: -np.log10(x)).astype(str).values
    weightedEdges = [edge + (risk,) for edge, risk in zip(edges, risks)]
    G.add_weighted_edges_from(weightedEdges)

    log("Constructing network --> %s %s" % (method,alpha))
    log("%s nodes and %s edges found..." % (len(G.nodes()),len(G.edges)))

    # NOTE(review): the flattened source does not show whether `verb` guards
    # only this log line or the whole annotation below; assumed log-only --
    # confirm against the original file.
    if verb:
        log('annotating netwrok file...')

    # Node attribute 'well': the wells of each seed, sorted numerically and
    # joined with underscores.
    wellsReadsDict = filtered_clustering_table.astype(str).groupby('seed')['well'].apply(
        lambda x: set(x.tolist())).to_dict()
    s = pd.Series(wellsReadsDict)
    attr_dict = s.apply(lambda x: '_'.join(sorted(list(x),key=int ))).to_dict()
    nx.set_node_attributes(G, name='well', values=attr_dict)

    # Copy per-seed attributes from the rows whose type is 'S' onto the graph.
    centroids_indexs = filtered_clustering_table[filtered_clustering_table['type'] == 'S'].index
    for attr in ['seq','clusterSize','domain']:
        attr_dict = dict(zip(filtered_clustering_table.loc[centroids_indexs,'seed'],
                             filtered_clustering_table.loc[centroids_indexs,attr]))
        nx.set_node_attributes(G, name=attr, values=attr_dict)
    nx.set_node_attributes(G, name='compressed', values=0)
    return G
def get_interactions(
        df_path='/Users/wrshoemaker/Desktop/ParEvol_test/data/Tenaillon_et_al/gene_by_pop.txt',
        df_out='/Users/wrshoemaker/Desktop/ParEvol_test/data/Tenaillon_et_al/significant_mutual_information_tenaillon.txt',
        n_simulations=10000):
    """Find gene pairs whose mutual information exceeds a permutation null.

    The observed pairwise statistic is compared with ``n_simulations``
    randomised matrices, the empirical p-values are corrected with
    Benjamini-Hochberg, and pairs with a corrected p-value below 0.01 are
    written to ``df_out`` as a symmetric table (missing entries filled with 0).

    Parameters:
        df_path (str): tab-separated input table; its columns are used as the
            gene names.
        df_out (str): path of the tab-separated output table.
        n_simulations (int): number of randomised matrices to draw.
    """
    df = pandas.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    # Transposed so that the columns of the input table (genes) become rows.
    df_np = numpy.transpose(df.values)
    genes = df.columns.to_list()
    # combinations() enumerates pairs in the same row-major order as the
    # strict upper triangle returned by triu_indices(k=1).
    gene_pairs = list(itertools.combinations(genes,2))

    mutal_info_matrix = get_mutual_information_binary_matrix(df_np)
    observed_flat = mutal_info_matrix[numpy.triu_indices(mutal_info_matrix.shape[0], k = 1)]

    # Keep only a running count of null values above the observed one instead
    # of storing every null draw per pair; the p-values are identical and the
    # memory use no longer grows with n_simulations.
    exceed_counts = numpy.zeros(observed_flat.shape, dtype=int)
    for i in range(n_simulations):
        if ( i % 1000 == 0) and (i>0):
            print("%d simulations complete!" % i)
        df_np_null = get_random_matrix(df_np)
        null_matrix = get_mutual_information_binary_matrix(df_np_null)
        null_flat = null_matrix[numpy.triu_indices(null_matrix.shape[0], k = 1)]
        exceed_counts += null_flat > observed_flat

    # The +1 in numerator and denominator keeps the empirical p-value above 0.
    p_values = (exceed_counts + 1) / (n_simulations + 1)

    reject, pvals_corrected, alphacSidak, alphacBonf = multitest.multipletests(
        p_values, alpha=0.05, method='fdr_bh')

    significanat_interaction_dict = {}
    for gene_pair, observed_mutual_info, p_value_corrected in zip(
            gene_pairs, observed_flat, pvals_corrected):
        if p_value_corrected >= 0.01:
            continue
        first_gene, second_gene = gene_pair
        # Store both directions so the resulting table is symmetric.
        significanat_interaction_dict.setdefault(first_gene, {})
        significanat_interaction_dict.setdefault(second_gene, {})
        significanat_interaction_dict[first_gene][second_gene] = observed_mutual_info
        significanat_interaction_dict[second_gene][first_gene] = observed_mutual_info

    df_significant = pandas.DataFrame.from_dict(significanat_interaction_dict)
    df_significant = df_significant.fillna(0)
    df_significant.to_csv(df_out, sep = '\t', index = True)
def stats_test(results_dir, signal_root):
    """Correct per-signal p-values with Benjamini-Hochberg and plot them.

    Loads ``signal_pvalues.csv`` from ``results_dir`` (or computes it with
    ``calc_signal_pvalues`` when the file is missing), writes the p-values
    sorted by their per-row minimum, applies a column-wise BH correction,
    writes the corrected table and renders Manhattan and per-locus plots into
    ``signal_pvalues.pdf``.

    Parameters:
        results_dir: path-like directory (must support ``/`` and ``exists``)
            used for both input and output files.
        signal_root: forwarded unchanged to ``calc_signal_pvalues`` and
            ``plot_signal_at_signif_loci``.
    """
    signal_pvalues_df_path = results_dir / "signal_pvalues.csv"
    signal_adjusted_pvalues_df_path = results_dir / "signal_adjusted_pvalues.csv"

    if signal_pvalues_df_path.exists():
        signal_pvalues_df = pd.read_csv(signal_pvalues_df_path, index_col=0)
    else:
        signal_pvalues_df = calc_signal_pvalues(signal_pvalues_df_path, signal_root)

    print("Processed hypothesis:", len(signal_pvalues_df))
    print(signal_pvalues_df.head(10))

    # Re-index by the "name" column and drop it from the data columns.
    signal_pvalues_df.index = signal_pvalues_df.name
    signal_pvalues_df.drop("name", inplace=True, axis=1)

    print("Not corrected pval, first 10 lowerest pvalues:")
    signal_pvalues_df["min"] = signal_pvalues_df.min(axis=1)
    signal_pvalues_df_sorted_by_min = signal_pvalues_df.sort_values(by="min")
    signal_pvalues_df_sorted_by_min.to_csv(str(results_dir / "signal_pvalues_sorted.csv"))
    print(signal_pvalues_df_sorted_by_min.head(10).to_string(line_width=300))

    # P-values correction
    # see: http://www.statsmodels.org/dev/_modules/statsmodels/stats/multitest.html
    print("Adjust pvalues..")
    signal_pvalues_bh_df = signal_pvalues_df.copy().drop("min", axis=1)
    for c in signal_pvalues_bh_df.columns:
        pvalues = signal_pvalues_bh_df.loc[:, c]
        pvalues_not_nan_mask = ~np.isnan(pvalues)
        pvalues_not_nan = pvalues[pvalues_not_nan_mask]
        if len(pvalues_not_nan) == 0:
            # Nothing to correct; multipletests divides by the number of
            # tests and fails on an empty array. The column stays all-NaN.
            continue
        _reject, pvalues_corrected, *_ = multipletests(
            pvals=pvalues_not_nan,
            # fdr_bh, holm-sidak, bonferroni
            alpha=0.05, method="fdr_bh"
        )
        signal_pvalues_bh_df.loc[pvalues_not_nan_mask, c] = pvalues_corrected

    signal_pvalues_bh_df["min"] = signal_pvalues_bh_df.min(axis=1, skipna=True)
    signal_pvalues_bh_sorted_df = signal_pvalues_bh_df.sort_values(by="min")
    signal_pvalues_bh_sorted_df.to_csv(str(signal_adjusted_pvalues_df_path))

    # Passing FDR correction
    signal_pvalues_bh_sorted_df_005 = signal_pvalues_bh_sorted_df[
        signal_pvalues_bh_sorted_df["min"] < 0.05]
    print("Passing FDR 0.05 by any metric:", len(signal_pvalues_bh_sorted_df_005))

    print("Corrected, first 10 lowerest pvalues:")
    print(signal_pvalues_bh_sorted_df.head(10).to_string(line_width=300))
    print("Same records, but original pvalues:")
    print(signal_pvalues_df.loc[signal_pvalues_bh_sorted_df.head(10).index, :].to_string(
        line_width=300))

    # Plots: one uncorrected and one corrected Manhattan plot per column
    # (including "min"), plus per-locus signal plots for the real columns.
    with PdfPages(str(results_dir / "signal_pvalues.pdf")) as pdf:
        for col in signal_pvalues_df.columns:
            loir.manhattan_plot(
                signal_pvalues_df.sort_values(by="min"), col,
                "Signal [{}] ODS vs YDS: Mann whitney u test pvalues".format(col),
                correction="Uncorrected",
                save_to=pdf
            )
            loir.manhattan_plot(
                signal_pvalues_bh_sorted_df.sort_values(by="min"), col,
                "Signal [{}] ODS vs YDS: Mann whitney u test pvalues".format(col),
                correction="Benjamini–Hochberg corrected",
                save_to=pdf
            )
            if col != "min":
                plot_signal_at_signif_loci("Uncorrected", signal_pvalues_df,
                                           col, pdf, signal_root)
                plot_signal_at_signif_loci("Benjamini–Hochberg corrected",
                                           signal_pvalues_bh_sorted_df,
                                           col, pdf, signal_root)
def start(self):
    """Run the Bryois replication analysis end to end.

    Loads the discovery and replication tables, harmonises identifiers,
    cell-type names and effect direction, merges them, computes a BH-FDR on
    the replication p-values of discovery-significant rows, writes the merged
    table (text and Excel), then plots and prints replication statistics.
    """
    self.print_arguments()

    print("Loading data")
    discovery_df = self.load_file(self.discovery_path, header=0, index_col=None)
    replication_df = self.load_file(self.bryois_path, header=0, index_col=0)
    print(discovery_df)
    print(replication_df)

    print("Pre-process the discovery data.")
    discovery_df = discovery_df.loc[
        ~discovery_df["SNP"].str.contains("nors"), :]
    # Index is "<Gene before the first '.'>_<third ':'-separated SNP field>".
    # presumably this matches the replication index format -- TODO confirm
    discovery_df.index = discovery_df["Gene"].str.split(
        ".", expand=True)[0] + "_" + discovery_df["SNP"].str.split(
            ":", expand=True)[2]
    discovery_df = discovery_df.loc[~discovery_df.index.duplicated(), :]
    discovery_cell_types = [
        x.split(" ")[0] for x in discovery_df.columns if "pvalue" in x
    ]
    discovery_aa_dict = dict(
        zip(discovery_df.index, discovery_df["Allele assessed"]))
    discovery_index_columns = [
        "Gene", "Gene symbol", "SNP", "Alleles", "Allele assessed"
    ]
    discovery_df.columns = [
        "MetaBrain " + col if col not in discovery_index_columns else col
        for col in discovery_df.columns
    ]

    print("Pre-process the replication data.")
    # Translate the cell types: only the first matching entry of the
    # translation table is applied to each column name.
    colnames = []
    for col in replication_df.columns:
        for bryois_ct, metabrain_ct in self.bryois_ct_trans.items():
            if bryois_ct in col:
                colnames.append(col.replace(bryois_ct, metabrain_ct))
                break
        else:
            colnames.append(col)
    replication_df.columns = colnames

    # Add the discovery affect allele.
    replication_df["discovery_aa"] = replication_df.index.map(
        discovery_aa_dict)

    # Flipping the beta's so they are expressed relative to the discovery
    # allele.
    replication_df["flip"] = replication_df[
        "effect_allele"] != replication_df["discovery_aa"]
    replication_cell_types = [
        x.replace(" p-value", "") for x in replication_df
        if x.endswith(" p-value")
    ]
    for ct in replication_cell_types:
        replication_df.loc[:, "{} beta".format(ct)] = replication_df[
            "{} beta".format(ct)] * replication_df["flip"].map({
                True: -1,
                False: 1
            })

    # Remove unwanted columns.
    replication_df.drop(["flip", "SNP", "effect_allele", "discovery_aa"],
                        axis=1,
                        inplace=True)

    # Change the column names.
    replication_df.columns = [
        "Bryois " + col for col in replication_df.columns
    ]

    # Add the sample size.
    replication_df["Bryois N"] = self.bryois_n

    print("Merging data.")
    df = discovery_df.merge(replication_df,
                            left_index=True,
                            right_index=True,
                            how="left")
    print(df)

    print("Adding BH-FDR for the replication.")
    overlap_ct = list(
        set(discovery_cell_types).intersection(
            set(replication_cell_types)))
    overlap_ct.sort()
    for ct in overlap_ct:
        print("\t{}".format(ct))
        df["Bryois {} BH-FDR".format(ct)] = np.nan
        discovery_mask = (df["MetaBrain {} BH-FDR".format(ct)] <=
                          0.05).to_numpy()
        print("\t Discovery N-ieqtls: {:,}".format(
            np.sum(discovery_mask)))
        replication_mask = (
            ~df["Bryois {} p-value".format(ct)].isna()).to_numpy()
        # Only rows significant in discovery AND tested in replication.
        mask = np.logical_and(discovery_mask, replication_mask)
        n_overlap = np.sum(mask)
        if n_overlap > 1:
            df.loc[
                mask,
                "Bryois {} BH-FDR".format(ct)] = multitest.multipletests(
                    df.loc[mask, "Bryois {} p-value".format(ct)],
                    method='fdr_bh')[1]
        # NOTE(review): the flattened source does not show whether this
        # summary sits inside the `if` above; it is printed unconditionally
        # here and the percentage is guarded against an empty overlap.
        n_replicating = df.loc[
            df["Bryois {} BH-FDR".format(ct)] <= 0.05, :].shape[0]
        pct_replicating = (
            (100 / n_overlap) * n_replicating if n_overlap else 0.0)
        print("\t Replication N-ieqtls: {:,} / {:,} [{:.2f}%]".format(
            n_replicating, n_overlap, pct_replicating))

    print("Reordering columns")
    columns_of_interest = discovery_index_columns.copy() + [
        "MetaBrain N", "MetaBrain HW pval", "MetaBrain Minor allele",
        "MetaBrain MAF", "Bryois N"
    ]
    for ct in overlap_ct:
        columns_of_interest.append("MetaBrain {} pvalue".format(ct))
        columns_of_interest.append("MetaBrain {} BH-FDR".format(ct))
        columns_of_interest.append(
            "MetaBrain {} interaction beta".format(ct))
    # `colnames` runs parallel to `columns_of_interest` and holds the output
    # names; they differ only for the replication columns renamed below.
    colnames = columns_of_interest.copy()
    for ct in replication_cell_types:
        columns_of_interest.append("Bryois {} p-value".format(ct))
        colnames.append("Bryois {} pvalue".format(ct))
        if ct in overlap_ct:
            columns_of_interest.append("Bryois {} BH-FDR".format(ct))
            colnames.append("Bryois {} BH-FDR".format(ct))
        columns_of_interest.append("Bryois {} beta".format(ct))
        colnames.append("Bryois {} eQTL beta".format(ct))
    df = df.loc[:, columns_of_interest].copy()
    df.columns = colnames
    print(df)

    print("Saving output")
    exclude_in_excel = [
        "MetaBrain N", "MetaBrain HW pval", "MetaBrain Minor allele",
        "MetaBrain MAF", "MetaBrain Overall z-score", "Bryois N"
    ]
    self.save_file(df=df,
                   outpath=os.path.join(self.outdir,
                                        "bryois_replication.txt.gz"),
                   index=False)
    self.save_file(
        df=df.loc[:,
                  [col for col in df.columns
                   if col not in exclude_in_excel]],
        outpath=os.path.join(self.outdir, "bryois_replication.xlsx"),
        index=False,
        sheet_name="Bryois et al. 2021")

    print("Visualizing")
    # Cell types are recovered from the second word of the FDR column names.
    discovery_ct = {
        x.split(" ")[1] for x in df.columns
        if "MetaBrain" in x and "FDR" in x
    }
    replication_ct = {
        x.split(" ")[1] for x in df.columns
        if "Bryois" in x and "FDR" in x
    }
    overlap_ct = list(discovery_ct.intersection(replication_ct))
    overlap_ct.sort()
    replication_stats_df = self.plot(df=df, cell_types=overlap_ct)
    self.save_file(df=replication_stats_df,
                   outpath=os.path.join(self.outdir,
                                        "replication_stats.txt.gz"))

    print("Replication stats")
    for label in replication_stats_df["label"].unique():
        print("\t{}".format(label))
        stats_df = replication_stats_df.loc[
            replication_stats_df["label"] == label, :]
        stats_df_mean = stats_df[["variable",
                                  "value"]].groupby("variable").mean()
        for index, row in stats_df_mean.iterrows():
            print("\t {}: {:.2f}".format(index, row["value"]))

        stats_df_sum = stats_df[["variable",
                                 "value"]].groupby("variable").sum()
        n_total = stats_df_sum.loc["N", "value"]
        n_concordant = stats_df_sum.loc["N concordant", "value"]
        # Guard the percentage against a zero total.
        pct_concordant = (100 / n_total) * n_concordant if n_total else 0.0
        print("\t Overall concordance: {:,}/{:,} [{:.2f}%]".format(
            n_concordant, n_total, pct_concordant))
        print("")
class _NullLogger:
    """No-op stand-in used when call_interactions is given no logger."""

    def set_rank(self, rank):
        pass

    def write(self, *args, **kwargs):
        pass


def call_interactions(indir, outdir, chrom_lens, binsize, dist,
                      neighborhood_limit_lower = 3,
                      neighborhood_limit_upper = 5, rank = 0, n_proc = 1,
                      max_mem = 2, logger = None):
    """Compute per-chromosome interaction significances and write them out.

    For every chromosome returned by ``get_proc_chroms`` for this rank, reads
    ``<indir>/<chrom>.normalized.combined.bedpe`` (plus its ``.cells.hdf``
    companion for the cell count), computes p-values batch by batch, adds a
    BH-FDR per bin distance (``fdr_dist``) and over the whole chromosome
    (``fdr_chrom``), and writes ``<outdir>/significances.<chrom>.bedpe``.

    Parameters:
        indir (str): directory holding the input bedpe / hdf files.
        outdir (str): output directory; created if missing.
        chrom_lens (dict): chromosome name -> length.
        binsize (int): bin size used to convert bin indices to coordinates.
        dist (int): maximum distance; ``dist // binsize`` is the largest bin
            distance passed to ``compute_significances``.
        neighborhood_limit_lower, neighborhood_limit_upper (int): forwarded to
            ``compute_significances``; the upper limit also trims the edges
            of the result.
        rank (int), n_proc (int): used to pick this process's chromosomes.
        max_mem: forwarded to ``determine_dense_matrix_size``.
        logger: object with ``set_rank`` and ``write``; when ``None`` nothing
            is logged.
    """
    if logger is None:
        # The declared default used to crash on the first logger call.
        logger = _NullLogger()
    logger.set_rank(rank)

    # exist_ok also covers several ranks racing to create the directory.
    os.makedirs(outdir, exist_ok=True)

    proc_chroms = get_proc_chroms(chrom_lens, rank, n_proc)

    def compute_fdr_by_dist(group):
        # BH correction within one group of equal (j - i) distance.
        fdrs = multipletests(list(group['pvalue']), method='fdr_bh')[1]
        group.loc[:, 'fdr_dist'] = fdrs
        return group

    for chrom in proc_chroms:
        logger.write(f'\tprocessor {rank}: computing for chromosome {chrom}', verbose_level=1, allow_all_ranks=True)
        chrom_filename = os.path.join(
            indir, ".".join([chrom, "normalized", "combined", "bedpe"]))
        with h5py.File(chrom_filename + ".cells.hdf", 'r') as ifile:
            num_cells = ifile[chrom].shape[1]
        logger.write(f'\tprocessor {rank}: detected {num_cells} cells for chromosome {chrom}',
                     append_time = False, allow_all_ranks = True, verbose_level = 2)

        d = pd.read_csv(chrom_filename, sep="\t", header=None)
        matrix_max_size = determine_dense_matrix_size(num_cells, dist, binsize, max_mem)
        submatrices = convert_sparse_dataframe_to_dense_matrix(
            d, matrix_max_size, dist, binsize, neighborhood_limit_upper,
            num_cells, chrom_lens[chrom], chrom_filename)
        max_distance_bin = dist // binsize

        results = []
        for i, (submatrix, start_index) in enumerate(submatrices):
            logger.write(f'\tprocessor {rank}: computing background for batch {i} of {chrom}, start index = {start_index}',
                         verbose_level = 3, allow_all_ranks = True, append_time = False)
            if i > 0:
                # Drop from the previous batch the rows at or beyond this
                # batch's start index; they are recomputed by this batch.
                results[-1] = results[-1][results[-1]['i'] < start_index]
            submat_result = compute_significances(
                submatrix, neighborhood_limit_upper, neighborhood_limit_lower,
                num_cells, start_index, max_distance_bin)
            results.append(submat_result)
        results = pd.concat(results, axis=0)

        # Trim neighborhood_limit_upper bins from both ends of the index range.
        min_index = 0
        max_index = results['j'].max()
        results = results[(results['i'] >= min_index + neighborhood_limit_upper) &
                          (results['j'] <= max_index - neighborhood_limit_upper)]

        results.reset_index(drop=True, inplace=True)
        results = results.groupby(results['j'] - results['i'], as_index=False).apply(compute_fdr_by_dist)
        results.loc[:, 'fdr_chrom'] = multipletests(list(results['pvalue']), method='fdr_bh')[1]
        # Convert bin indices to coordinates so they can be joined to the input.
        results.loc[:, 'i'] = (results['i'] * binsize).astype(int)
        results.loc[:, 'j'] = (results['j'] * binsize).astype(int)

        # Keep only the first seven input columns and join on start coordinates.
        d = d.iloc[:, list(range(7))]
        d.columns = ['chr1', 'x1', 'x2', 'chr2', 'y1', 'y2', 'outlier_count']
        d = d.merge(results, left_on=['x1', 'y1'], right_on=['i', 'j'])
        d.drop(['i', 'j'], axis=1, inplace=True)

        logger.write(f'\tprocessor {rank}: computation for {chrom} completed. writing to file.',
                     append_time = False, allow_all_ranks = True, verbose_level = 2)
        d.to_csv(os.path.join(outdir, ".".join(["significances", chrom, "bedpe"])),
                 sep="\t", index=False)
def adjust_pvalues(pvalues, FDR=0.05):
    """Correct p values with the Benjamini-Hochberg correction method.

    Parameters:
        pvalues: sequence of ``(label, pvalue)`` pairs.
        FDR (float): alpha forwarded to ``multipletests``.

    Returns:
        list of ``(label, adjusted_pvalue)`` pairs in the input order; an
        empty list when ``pvalues`` is empty.
    """
    if len(pvalues) == 0:
        # multipletests divides by the number of tests, so an empty input
        # must be short-circuited here.
        return []
    values = [x[1] for x in pvalues]
    adjusted_values = multipletests(values, alpha=FDR, method="fdr_bh")[1]
    return [(item[0], adjusted)
            for item, adjusted in zip(pvalues, adjusted_values)]