def calc_overlap_stats(test_set, geneset_dict, total_genes):
    """Get the overlaps and compute hypergeometric stats.

    The overlap table is produced by ``get_overlaps``; this function reads
    its ``"match_count"`` and ``"size of set"`` columns.  Rows without any
    match are dropped, and Bonferroni / Benjamini-Hochberg adjusted p-values
    are added when at least one row remains.

    :param test_set: genes under test (only its length is used here)
    :param geneset_dict: passed through to ``get_overlaps``
    :param total_genes: total number of genes in the background
    :return: the overlap table sorted by ascending ``"hypergeom p-val"``
    """
    overlaps = get_overlaps(test_set, geneset_dict)
    n_test = len(test_set)

    def _row_pvalue(row):
        # Survival function at (matches - 1), i.e. P(X >= matches).
        return scipy.stats.hypergeom.sf(
            row["match_count"] - 1,  # number of differentially expressed genes in set
            total_genes,             # total number of genes
            row["size of set"],      # number of genes in current set
            n_test)                  # total number of genes in test set

    p = overlaps.apply(_row_pvalue, axis=1)
    p = pd.DataFrame(p, columns=["hypergeom p-val"])

    # Boolean mask / sort_values replace DataFrame.select, .ix and .sort,
    # which no longer exist in current pandas.
    overlaps = overlaps.loc[overlaps["match_count"] > 0]
    overlaps = overlaps.merge(p, left_index=True, right_index=True)
    overlaps = overlaps.sort_values("hypergeom p-val", ascending=True)

    if len(overlaps.index) > 0:
        raw = overlaps["hypergeom p-val"]
        overlaps["bonferroni"] = multicomp.multipletests(
            raw, method="bonferroni")[1]
        # NaN p-values are treated as 1.0 for the FDR step only.
        overlaps["b-h fdr adj pval"] = multicomp.multipletests(
            raw.fillna(1.0), method="fdr_bh")[1]

    return overlaps.sort_values("hypergeom p-val", ascending=True)
def adjustPvalue(s, preAdjusted, layoutAware):
    """Write the rows of ``preAdjusted`` to ``s.file`` with adjusted p-values.

    Output is tab-separated.  When statsmodels cannot be imported every row
    gets a single extra NaN column; otherwise every row gets two extra
    columns: the Benjamini-Hochberg and the Bonferroni adjusted p-value,
    each passed through ``sigDigs``.

    :param s: object whose ``file`` attribute is the output path
    :param preAdjusted: re-iterable of list rows; the p-value is at index 2
        when ``layoutAware`` is true, else at index 1
    :param layoutAware: selects the p-value column as described above
    """
    idx = 2 if layoutAware else 1

    try:
        # Some hosts do not have this library. If not we don't adjust
        import statsmodels.sandbox.stats.multicomp as multicomp
    except Exception:
        adjust = False
    else:
        adjust = True

    with open(s.file, 'w') as handle:
        writer = csv.writer(handle, delimiter='\t')

        if not adjust:
            # No adjustment will happen so just write a NaN value to
            # the file so the UI won't try to display this value
            for row in preAdjusted:
                writer.writerow(row + [float('NaN')])
            return

        # Extract the p-values from the data.
        # Translate NaNs to one so the stats routine will take it.
        preAdjVals = [1 if math.isnan(row[idx]) else row[idx]
                      for row in preAdjusted]

        def _corrected(method):
            # multipletests returns
            # [reject, p_vals_corrected, alphacSidak, alphacBonf];
            # fall back to all ones if the correction fails.
            try:
                _reject, adjusted, _sidak, _bonf = multicomp.multipletests(
                    preAdjVals, alpha=0.05, method=method)
            except Exception:
                adjusted = [1 for _ in preAdjusted]
            return adjusted

        adjPvals = _corrected('fdr_bh')       # Benjamini-Hochberg FDR
        adjPvalsB = _corrected('bonferroni')  # Bonferroni

        for i, row in enumerate(preAdjusted):
            writer.writerow(
                row + [sigDigs(adjPvals[i]), sigDigs(adjPvalsB[i])])
def __enrich_counts(db, to_r, query, pval_cutoff):
    """Add p-values and adjusted q-values to ``to_r`` and filter by cutoff.

    Note that the ``pval``, ``qval_BH`` and ``qval_bonferroni`` columns are
    added to the caller's ``to_r`` frame in place; the returned frame is a
    sorted, filtered and re-indexed selection of it.

    :param db: object exposing ``bicluster_info.count()``; the count is
        forwarded to ``compute_p`` as ``M``
    :param to_r: DataFrame with at least a ``counts`` column
    :param query: object whose ``shape[0]`` is forwarded to ``compute_p`` as ``N``
    :param pval_cutoff: rows with ``pval`` above this value are dropped
    :return: rows with ``pval <= pval_cutoff`` sorted by ``pval`` then
        ``counts``, with an integer index
    """
    to_r["pval"] = to_r.apply(
        compute_p, axis=1, M=db.bicluster_info.count(), N=query.shape[0])
    to_r["qval_BH"] = multipletests(to_r.pval, method='fdr_bh')[1]
    to_r["qval_bonferroni"] = multipletests(to_r.pval, method='bonferroni')[1]
    to_r = to_r.sort_values(["pval", "counts"], ascending=True)
    # only return below pval cutoff
    to_r = to_r.loc[to_r.pval <= pval_cutoff, :]
    # make sure GRE ids are integers; build a real list because ``map`` is a
    # lazy iterator on Python 3 and is not a reliable index value
    to_r.index = [int(label) for label in to_r.index]
    return to_r
def hypergeometric_significant_celltypes(self):
    '''
    Hypergeometric test for significance of celltype enrichment.

    Reads ``self.sigCelltypedf`` (columns ``celltype`` and ``genecluster``),
    ``self.cellgenedf`` (columns ``celltype``, ``gene``, ``FDR``) and
    ``self.occurrencedf``.  Stores the result, sorted by ``hyper_pval`` and
    carrying the new ``hyper_pval`` / ``hyper_FDR`` columns, back into
    ``self.sigCelltypedf``.
    '''
    print('Testing celltype enrichment....')
    sigcelltype = self.sigCelltypedf
    cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])
    totalgenes = self.occurrencedf.shape[0]
    fdr_hits = self.cellgenedf[self.cellgenedf['FDR'] <= 0.05]
    allsiggenes = len(set(fdr_hits['gene']))

    # Collect the p-values in row order and assign the column in one go.
    # The previous ``iloc[index, col]`` used the row *label* as a position,
    # which is only right for a 0..n-1 index.
    ## stats.hypergeom.sf(x, M, n, N)
    hyper_pvals = [
        stats.hypergeom.sf(
            row['genecluster'] - 1,
            totalgenes,
            allsiggenes,
            len(cellgroup.get_group(row['celltype'])))
        for _, row in sigcelltype.iterrows()
    ]
    sigcelltype['hyper_pval'] = hyper_pvals

    sigcelltype = sigcelltype.sort_values('hyper_pval', ascending=True)
    sigcelltype.index = range(len(sigcelltype))
    pvals = sigcelltype['hyper_pval'].values
    corr_pvals = statsmodels.multipletests(
        pvals=pvals, alpha=0.05, method='fdr_bh')
    sigcelltype['hyper_FDR'] = corr_pvals[1]
    self.sigCelltypedf = sigcelltype
def celltype_overrepresntation_list(self, enrichmentdf):
    '''
    Save the result of significance in one DF.

    Every (gene, celltype) cell of ``self.binom_pval_df`` whose p-value is
    below the (currently fixed) significance of 1 becomes one row with the
    columns ``celltype``, ``gene``, ``enrichment``, ``binom_pval`` and
    ``FDR``.  The frame is stored in ``self.cellgenedf`` and
    ``self.overall_significant_celltypes()`` is called afterwards.

    :param enrichmentdf: frame indexed like ``self.binom_pval_df``; read via
        ``enrichmentdf.loc[gene, cell]``
    '''
    significance = 1
    column = ['celltype', 'gene', 'enrichment', 'binom_pval', 'FDR']

    # Gather plain tuples and build the frame once; DataFrame.append (and
    # Series.iteritems) are gone from current pandas and appending row by
    # row was quadratic.
    rows = []
    for gene, celltype in self.binom_pval_df.iterrows():
        for cell, pval in celltype.items():
            if pval < significance:
                rows.append(
                    (cell, gene, enrichmentdf.loc[gene, cell], pval, 0))

    cellgenedf = pd.DataFrame(rows, columns=column)
    cellgenedf = cellgenedf.sort_values(
        ['celltype', 'binom_pval'], ascending=[True, True])
    cellgenedf.index = range(len(cellgenedf))

    # multipletests cannot handle an empty p-value vector
    if len(cellgenedf) > 0:
        pvals = cellgenedf['binom_pval'].values
        corr_pvals = statsmodels.multipletests(
            pvals=pvals, alpha=0.05, method='fdr_bh')
        cellgenedf['FDR'] = corr_pvals[1]

    print('cellgenedf shape:', cellgenedf.shape)
    self.cellgenedf = cellgenedf
    print('cellgenedf shape after:', cellgenedf.shape)
    self.overall_significant_celltypes()
def run(study, pop, gene_set, adjust='fdr_bh'):
    '''
    Run a Over-represent analysis toward a gene set

    :param study: the significant gene set
    :param pop: the background gene set
    :param gene_set: the function set, either a dict (name -> genes) or
        something ``GMTUtils.parse_gmt_file`` accepts
    :param adjust: the adjust method in the multiple tests,
        details in http://www.statsmodels.org/dev/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
    :return: the ORA analysis result
    '''
    gene_sets = (gene_set if isinstance(gene_set, dict)
                 else GMTUtils.parse_gmt_file(gene_set))

    # String-ify the background and study ids once, not once per gene set.
    pop_ids = {str(x) for x in pop}
    study_ids = {str(x) for x in study}

    names = list(gene_sets)
    n_mapped = {}
    n_study = {}
    pvalues = {}
    for name in names:
        members = set(gene_sets[name])
        n_mapped[name] = len(members & pop_ids)
        n_study[name] = len(members & study_ids)
        pvalues[name] = stats.hypergeom.sf(
            n_study[name] - 1, len(pop), n_mapped[name], len(study))

    _, adjusted, _, _ = multicomp.multipletests(
        [pvalues[name] for name in names], method=adjust)
    rfdr = dict(zip(names, adjusted))

    df = pd.DataFrame({
        'name': names,
        'mapped': [n_mapped[name] for name in names],
        'number in study': [n_study[name] for name in names],
        'p-value': [pvalues[name] for name in names],
        'fdr': [rfdr[name] for name in names],
    })
    df = df[['name', 'mapped', 'number in study', 'p-value', 'fdr']]
    return ORA(df, study, pop, adjust)
def combine(self, results):
    """
    Stouffer combination of zscores

    :param results: 2-D masked array of z-scores (it must provide ``sum``
        and ``count`` along ``axis=1``); masked entries are ignored
    :return: array stacking ``[zscores, pvalues, qvalues]``; rows whose
        combined z-score is masked get NaN p- and q-values
    """
    zscores = results.sum(axis=1) / np.sqrt(results.count(axis=1))
    size = zscores.size
    # getmaskarray always yields a full boolean array.  Reading ``.mask``
    # directly can give the scalar ``np.ma.nomask`` when nothing is masked,
    # in which case np.where would select index 0 only.
    is_nan = np.ma.getmaskarray(zscores)
    valid_indices = np.where(~is_nan)
    invalid_indices = np.where(is_nan)

    pv = stats.norm.sf(zscores[valid_indices])
    pvalues = np.empty(size)
    pvalues[valid_indices] = pv
    pvalues[invalid_indices] = np.nan

    # multipletests cannot be called on an empty vector
    if pv.size != 0:
        qv = multipletests(pv, method='fdr_bh')[1]
    else:
        qv = np.array([])
    qvalues = np.empty(size)
    qvalues[valid_indices] = qv
    qvalues[invalid_indices] = np.nan

    return np.array([zscores, pvalues, qvalues])
def stat_test(f, test_name, test, fdr):
    """Run ``test`` on every row of the TSV file ``f`` and save the results.

    Rows whose ``chr`` does not match ``chr[0-9XYM]+`` are dropped.  The
    "od"/"yd" value columns are selected with ``is_od`` / ``is_yd``.  Two
    files are written next to ``f``: all rows sorted by p-value, and (when
    any exist) the rows rejected at the given FDR.

    :param f: path to a tab-separated input file with ``chr``, ``start``
        and ``end`` columns
    :param test_name: label used in the output file names
    :param test: callable ``test(od_values, yd_values) -> p-value``
    :param fdr: alpha passed to the Benjamini-Hochberg correction
    """
    print('Testing', test_name, f, 'fdr', fdr)
    df = pd.read_csv(f, sep='\t')
    # Drop contigs
    keep = [bool(re.match('chr[0-9XYM]+$', c)) for c in df['chr']]
    df = df.loc[keep].copy()

    ods = [c for c in df.columns.values if is_od(c)]
    yds = [c for c in df.columns.values if is_yd(c)]
    pvals = np.array([test(row[ods], row[yds]) for _, row in df.iterrows()])
    res = multipletests(pvals, fdr, "fdr_bh")
    h0_rejects = res[0]
    pvals_adj = res[1]

    df['pval'] = pvals
    df['pval_adj'] = pvals_adj
    df['od_mean'] = df[ods].mean(axis=1)
    df['yd_mean'] = df[yds].mean(axis=1)
    df['logfc'] = np.log(df['od_mean'] / df['yd_mean'])

    # Sort by pvalue.  argsort yields positions, so use iloc: after the
    # contig filter the index labels no longer match row positions.
    pvals_order = pvals.argsort()
    df = df.iloc[pvals_order]
    h0_rejects = h0_rejects[pvals_order]

    # Save results
    results = re.sub(r'\.tsv', '_{}.tsv'.format(test_name), f)
    df[['chr', 'start', 'end', 'yd_mean', 'od_mean',
        'logfc', 'pval', 'pval_adj']] \
        .to_csv(results, sep='\t', index=None, header=True)
    print('Saved test results to', results)

    # Save significant results
    n_rejected = sum(h0_rejects)
    if n_rejected > 0:
        results_fdr = re.sub(
            r'\.tsv', '_{}_diff_fdr_{}.bed'.format(test_name, fdr), f)
        df.loc[h0_rejects, ['chr', 'start', 'end']] \
            .to_csv(results_fdr, sep='\t', index=None, header=True)
        print('Saved {} significant results at FDR={} to {}'.format(
            n_rejected, fdr, results_fdr))
def fit(self, df_X, df_y):
    """Score every column of ``df_X`` per cluster with a Mann-Whitney U test.

    For each unique label in the single column of ``df_y`` the rows with
    that label are compared (one-sided, "greater") against all other rows,
    column by column.  All p-values are Benjamini-Hochberg corrected
    together and ``self.act_`` is set to ``-log10`` of the corrected values
    (rows: columns of ``df_X``; columns: clusters).

    :param df_X: DataFrame of values, one row per region
    :param df_y: DataFrame with exactly one label column and as many rows
        as ``df_X``
    :raises ValueError: if the row counts differ or ``df_y`` does not have
        exactly one column
    """
    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")

    # calculate Mann-Whitney U p-values
    labels = df_y.iloc[:, 0]
    clusters = df_y[df_y.columns[0]].unique()
    pvals = []
    for cluster in clusters:
        pos = df_X[labels == cluster]
        neg = df_X[labels != cluster]
        p = []
        for m in pos:
            try:
                p.append(mannwhitneyu(pos[m], neg[m], alternative="greater")[1])
            except Exception as e:
                # best effort: a failing column counts as not significant
                sys.stderr.write(str(e) + "\n")
                sys.stderr.write("motif {} failed, setting to p = 1\n".format(m))
                p.append(1)
        pvals.append(p)

    # correct for multiple testing
    pvals = np.array(pvals)
    fdr = multipletests(pvals.flatten(), method="fdr_bh")[1].reshape(pvals.shape)

    # create output DataFrame from the corrected values (the raw p-values
    # were used here before, leaving the correction above without effect)
    self.act_ = pd.DataFrame(-np.log10(fdr.T), columns=clusters, index=df_X.columns)
def combine(self, results):
    """
    Fisher's combination of pvalues

    :param results: 2-D array of p-values, one row per item; values below
        ``PVALUE_EPSILON`` are raised to it, and entries that end up masked
        by the masked logarithm are ignored
    :return: array stacking ``[pvalues, qvalues]``; rows without any usable
        p-value get NaN
    """
    results = np.copy(results)
    results[results < PVALUE_EPSILON] = PVALUE_EPSILON
    log = np.ma.log(results)
    s = log.sum(axis=1)
    count = log.count(axis=1)
    size = s.size
    # Always a full boolean array, even if the mask were np.ma.nomask.
    is_nan = np.ma.getmaskarray(s)
    valid_indices = np.where(~is_nan)
    invalid_indices = np.where(is_nan)

    # Survival function instead of ``1 - cdf``: the subtraction rounds very
    # small p-values to exactly 0.
    pv = stats.chi2.sf(-2.0 * s[valid_indices], 2 * count[valid_indices])
    pvalues = np.empty(size)
    pvalues[valid_indices] = pv
    pvalues[invalid_indices] = np.nan

    # multipletests cannot be called on an empty vector
    if pv.size != 0:
        qv = multipletests(pv, method='fdr_bh')[1]
    else:
        qv = np.array([])
    qvalues = np.empty(size)
    qvalues[valid_indices] = qv
    qvalues[invalid_indices] = np.nan

    return np.array([pvalues, qvalues])
def perform_multiple_comparison_stat(data1, data2, alpha=0.05):
    """Column-wise t-tests with multiple-comparison correction.

    Column ``i`` of ``data1`` is compared with column ``i`` of ``data2``
    using ``ttest_ind``; the p-values are corrected with ``multipletests``
    (its default method).  The rejection flags are printed.

    :param data1: 2-D sequence, one column per comparison
    :param data2: 2-D sequence with the same number of columns
    :param alpha: level passed to ``multipletests`` and also the minimum
        fraction of rejected comparisons required to return True
    :return: True if they are statistically different, i.e. the fraction of
        rejected comparisons is at least ``alpha``; False otherwise
        (including when there are no comparisons)
    """
    mat1 = np.array(data1)
    mat2 = np.array(data2)
    comparisons = len(data1[0])
    pvals = [ttest_ind(mat1[:, i].tolist(), mat2[:, i])[1]
             for i in range(comparisons)]
    rejected = multipletests(pvals, alpha=alpha)[0]
    print(rejected)

    # Version where the number of trues must exceed alpha
    # (useful when you have A LOT of elements)
    if len(rejected) == 0:
        return False
    # explicit float so the ratio is not truncated by integer division
    fraction = float(np.count_nonzero(rejected)) / len(rejected)
    return fraction >= alpha
def binom_significant_celltypes(self):
    '''
    Binomial test for significance of celltype enrichment.

    Reads ``self.sigCelltypedf`` (columns ``celltype`` and ``genecluster``),
    ``self.cellgenedf`` (column ``celltype``) and ``self.binom_prob_occu``
    (columns ``celltype`` and ``background_prob``).  Stores the result,
    sorted by ``binom_pval`` and carrying the new ``binom_pval`` /
    ``binom_FDR`` columns, back into ``self.sigCelltypedf``.
    '''
    print('Testing celltype enrichment....')
    sigcelltype = self.sigCelltypedf
    cellgroup = self.cellgenedf.groupby(self.cellgenedf['celltype'])
    binom_prob_occu = self.binom_prob_occu

    # Collect the p-values in row order and assign the column in one go.
    # The previous ``iloc[index, col]`` used the row *label* as a position,
    # which is only right for a 0..n-1 index.
    binom_pvals = []
    for _, row in sigcelltype.iterrows():
        # first row of the background table matching this celltype
        bprob_ind = binom_prob_occu[
            binom_prob_occu['celltype'] == row['celltype']].index[0]
        background_prob = binom_prob_occu.loc[bprob_ind, 'background_prob']
        binom_pvals.append(stats.binom_test(
            row['genecluster'] - 1,
            len(cellgroup.get_group(row['celltype'])),
            background_prob,
            alternative='two-sided'))
    sigcelltype['binom_pval'] = binom_pvals

    sigcelltype = sigcelltype.sort_values('binom_pval', ascending=True)
    sigcelltype.index = range(len(sigcelltype))
    pvals = sigcelltype['binom_pval'].values
    corr_pvals = statsmodels.multipletests(
        pvals=pvals, alpha=0.05, method='fdr_bh')
    sigcelltype['binom_FDR'] = corr_pvals[1]
    self.sigCelltypedf = sigcelltype
def test_multi_pvalcorrection():
    """Compare corrected p-values with R's multtest ``mt.rawp2adjp`` output.

    ``rmethods`` maps the R method name to (column in ``res_multtest``,
    method code passed to ``multipletests``).
    """
    rmethods = {
        "rawp": (0, "pval"),
        "Bonferroni": (1, "b"),
        "Holm": (2, "h"),
        "Hochberg": (3, "sh"),
        "SidakSS": (4, "s"),
        "SidakSD": (5, "hs"),
        "BH": (6, "fdr_i"),
        "BY": (7, "fdr_n"),
    }
    # Reorders our output to the row order of the R reference table.
    r_sortindex = [6, 8, 9, 7, 5, 1, 2, 4, 0, 3]
    checked_methods = ["b", "s", "sh", "hs", "h", "fdr_i", "fdr_n"]

    for column, method in rmethods.values():
        if method not in checked_methods:
            continue
        corrected = multipletests(pval0, alpha=0.1, method=method)[1]
        assert_almost_equal(corrected[r_sortindex], res_multtest[:, column], 15)

    # fdrcorrection0 is checked on sorted values only
    for fdr_method, column in (("n", 7), ("i", 6)):
        pvalscorr = np.sort(fdrcorrection0(pval0, method=fdr_method)[1])
        assert_almost_equal(pvalscorr, res_multtest[:, column], 15)
def calculate_quantile_pvalue(
        self, quantile, minvalues=10
    ):
    """Compare quantile distances between two conditions.

    For every requested quantile the table records the number and mean of
    the ``dist`` values per sample and per condition, plus a Mann-Whitney U
    p-value between the two conditions and its Benjamini-Hochberg FDR.

    The data come from ``self.extract_dist_quantile(quantile)``, which must
    provide the columns ``quan``, ``cond``, ``smpl`` and ``dist``.
    ``self.matrices`` maps each condition to its samples and must contain
    exactly two conditions.

    :param quantile: float or list of floats
    :param minvalues: minimum number of values required in both conditions
        for a p-value to be computed
    :return: DataFrame indexed by quantile
    :raises TypeError: if ``quantile`` is neither a float nor a list of floats
    """
    # Check arguments
    if isinstance(quantile, float):
        quantile = [quantile]
    elif isinstance(quantile, list):
        for q in quantile:
            if not isinstance(q, float):
                raise TypeError('quantile list must contain floats')
    else:
        raise TypeError('quantile must be float or list of floats')
    # Create colnames for output dataframe
    colNames = []
    for condition in self.matrices:
        for sample in self.matrices[condition]:
            colNames.append('{}_{}_no'.format(condition, sample))
            colNames.append('{}_{}_mean'.format(condition, sample))
    for condition in self.matrices:
        colNames.append('{}_no'.format(condition))
        colNames.append('{}_mean'.format(condition))
    colNames.extend(['pvalue', 'fdr'])
    # Create output dataframe
    outDF = pd.DataFrame(index=quantile, columns=colNames)
    outDF = outDF.sort_index()
    # Extract quantile distance data
    quantData = self.extract_dist_quantile(quantile)
    splitQuant = quantData.groupby('quan')
    for q, data in splitQuant:
        # Extract data for conditions and samples
        condValues = []
        for cond in self.matrices:
            # Extract data for condition
            condData = data[data['cond'] == cond]
            condDist = condData['dist']
            condValues.append(condDist)
            # Add condition data to output
            colPrefix = '{}_'.format(cond)
            outDF.loc[q, colPrefix + 'no'] = condDist.size
            outDF.loc[q, colPrefix + 'mean'] = condDist.mean()
            for smpl in self.matrices[cond]:
                # Extract data for sample
                smplData = condData[condData['smpl'] == smpl]
                smplDist = smplData['dist']
                # Add sample data to output
                colPrefix = '{}_{}_'.format(cond, smpl)
                outDF.loc[q, colPrefix + 'no'] = smplDist.size
                outDF.loc[q, colPrefix + 'mean'] = smplDist.mean()
        # Calculate pvalues
        dist1, dist2 = condValues
        if dist1.size >= minvalues and dist2.size >= minvalues:
            outDF.loc[q, 'pvalue'] = mannwhitneyu(dist1, dist2)[1]
    # Add fdr and return.  Skip the correction when no p-value could be
    # computed: multipletests fails on an empty vector.
    pvalueIndex = outDF.index[~outDF['pvalue'].isnull()]
    if len(pvalueIndex) > 0:
        outDF.loc[pvalueIndex, 'fdr'] = multipletests(
            outDF.loc[pvalueIndex, 'pvalue'], method='fdr_bh')[1]
    return(outDF)
def calculate_gene_expression_similarity(reduced_stat_map_data, mask="full"):
    """Regress per-donor gene expression on stat-map values and test slopes.

    For every key in the HDF store, the expression data are z-scored and
    regressed on the z-scored stat-map values (least squares with an
    intercept).  The per-donor slopes are tested against zero with a
    one-sample t-test across donors and the p-values are
    Benjamini-Hochberg corrected.

    :param reduced_stat_map_data: array indexable by the expression data's
        columns (presumably one value per sampled location - confirm with
        the caller)
    :param mask: "full", "subcortex" or "cortex"; the latter two also drop
        locations based on NaN entries of the subcortex mask file
    :return: DataFrame with probe annotation, ``t``, ``p``,
        ``p (FDR corrected)`` and the mean/std of "variance explained"
    """
    store_file = "/ahba_data/store_max1_reduced.h5"
    subcortex_mask = "/ahba_data/subcortex_mask.npy"

    results_dfs = []
    subcortex_values = None  # loaded at most once, and only when needed
    with pd.HDFStore(store_file, 'r') as store:
        for donor_id in store.keys():
            # print("..." % x) behaves the same on Python 2 and Python 3
            print("Loading expression data (%s)" % donor_id)
            expression_data = store.get(donor_id.replace(".", "_"))

            print("Getting statmap values (%s)" % donor_id)
            nifti_values = reduced_stat_map_data[expression_data.columns]

            print("Removing missing values (%s)" % donor_id)
            na_mask = np.isnan(nifti_values)
            if mask in ("subcortex", "cortex"):
                if subcortex_values is None:
                    subcortex_values = np.load(subcortex_mask)
                mask_is_nan = np.isnan(subcortex_values[expression_data.columns])
                if mask == "subcortex":
                    na_mask = np.logical_or(na_mask, mask_is_nan)
                else:
                    na_mask = np.logical_or(na_mask, np.logical_not(mask_is_nan))
            else:
                assert mask == "full"

            nifti_values = np.array(nifti_values)[np.logical_not(na_mask)]
            expression_data.drop(expression_data.columns[na_mask], axis=1, inplace=True)

            print("z scoring (%s)" % donor_id)
            expression_data = pd.DataFrame(zscore(expression_data, axis=1),
                                           columns=expression_data.columns,
                                           index=expression_data.index)
            nifti_values = zscore(nifti_values)

            print("Calculating linear regressions (%s)" % donor_id)
            # design matrix: stat-map values plus an intercept column
            regression_results = np.linalg.lstsq(
                np.c_[nifti_values, np.ones_like(nifti_values)], expression_data.T)
            results_df = pd.DataFrame({"slope": regression_results[0][0]},
                                      index=expression_data.index)
            # donor_id[1:] drops the first character of the store key
            results_df.columns = pd.MultiIndex.from_tuples(
                [(donor_id[1:], c,) for c in results_df.columns],
                names=['donor_id', 'parameter'])
            results_dfs.append(results_df)

    print("Concatenating results")
    results_df = pd.concat(results_dfs, axis=1)
    del results_dfs

    t, p = ttest_1samp(results_df, 0.0, axis=1)
    # the index is taken from the last donor's expression data
    group_results_df = pd.DataFrame({"t": t, "p": p}, columns=['t', 'p'],
                                    index=expression_data.index)
    _, group_results_df["p (FDR corrected)"], _, _ = multipletests(
        group_results_df.p, method='fdr_bh')

    slopes_sq_pct = results_df.xs('slope', axis=1, level=1) ** 2 * 100
    group_results_df["variance explained (mean)"] = slopes_sq_pct.mean(axis=1)
    group_results_df["variance explained (std)"] = slopes_sq_pct.std(axis=1)
    del results_df

    probe_info = pd.read_csv("/ahba_data/probe_info_max1.csv",
                             index_col=0).drop(['chromosome', "gene_id"], axis=1)
    group_results_df = group_results_df.join(probe_info)
    group_results_df = group_results_df[["gene_symbol", "entrez_id.1", "gene_name",
                                         "t", "p", "p (FDR corrected)",
                                         "variance explained (mean)",
                                         "variance explained (std)"]]
    return group_results_df
def bhCorrection(s, n=None):
    """Benjamini-Hochberg correction for a Series of p-values.

    NaN p-values are treated as 1.  When ``n`` exceeds ``len(s)`` the input
    is padded with ones up to ``n`` tests before the correction.

    :param s: pandas Series of p-values
    :param n: optional total number of tests; ignored when None or not
        larger than ``len(s)``
    :return: Series named ``p_adj`` with the index of ``s``
    """
    s = s.fillna(1.)
    p_vals = list(s)
    # ``n is not None`` first: comparing None with an int raises on Python 3
    if n is not None and n > len(s):
        p_vals += [1] * (n - len(s))
    q = multicomp.multipletests(p_vals, method='fdr_bh')[1][:len(s)]
    return pd.Series(q, s.index, name='p_adj')
def qvalues(self, below=0.1):
    """Return the entries whose Benjamini-Hochberg q-value is below a cutoff.

    ``self._pvalues`` is expected to hold ``(item, pvalue)`` pairs; it is
    filled via ``self.pvalues()`` when empty.  The result is also stored in
    ``self._qvalues``.

    :param below: exclusive upper bound on the q-value
    :return: list of ``(qvalue, pvalue, item)`` tuples
    """
    if not self._pvalues:
        self.pvalues()
    pvals = [x[1] for x in self._pvalues]
    # multipletests cannot be called on an empty list
    if pvals:
        qvals = list(multipletests(pvals, method='fdr_bh')[1])
    else:
        qvals = []
    res = [(q, p, x) for (q, (x, p)) in zip(qvals, self._pvalues) if q < below]
    self._qvalues = res
    log.notice('got %d peaks with qvalue below %.2f. From %d possible.' % (
        len(res), below, len(pvals)))
    return res
def adjust_score(self, score):
    """
    Returns a list of adjusted p-values.

    The correction method is read from ``self.args_dict['pvaladjust']`` and
    the alpha level from ``self.args_dict['threshold']``.

    :param score: the list of p-values to adjust
    :return: ``score`` itself when no method is configured, otherwise the
        adjusted p-values
    """
    method = self.args_dict['pvaladjust']
    if method is None:
        return score
    corrected = multipletests(
        score, alpha=self.args_dict['threshold'], method=method)
    return corrected[1]
def bhCorrection(s, n=None):
    """
    Benjamini-Hochberg correction for a Series of p-values.

    Missing p-values count as 1.  If ``n`` is given and larger than the
    number of p-values, the list is padded with ones so that the correction
    accounts for ``n`` tests in total.

    :param s: pandas Series of p-values
    :param n: optional total number of tests (default: just ``len(s)``)
    :return: Series ``p_adj`` aligned with the index of ``s``
    """
    s = s.fillna(1.)
    n_obs = len(s)
    # Guard against the default: ``None > int`` is a TypeError on Python 3.
    n_pad = n - n_obs if (n is not None and n > n_obs) else 0
    p_vals = list(s) + [1] * n_pad
    q = multicomp.multipletests(p_vals, method='fdr_bh')[1][:n_obs]
    return pd.Series(q, s.index, name='p_adj')
def pvalues(vect, vals, side="two-sided", significance_threshold=0.05, multi_tests_cor_method="fdr_bh"):
    """
    Computes the pvalue of a given values in a vector.

    :param vect: the vector used to compute the pvalue
    :param vals: the values used to compute the pvalue (iterable)
    :param side: the sides to find the pvalue (can be 'two-sided', 'left' or 'right'). Default is 'two-sided'
    :param significance_threshold: the significance threshold. Default is 0.05
    :param multi_tests_cor_method: multiple testing correction method, or None to skip the
        correction. Default is 'fdr_bh'

    :returns: a list with one dict per value (keys ``pvalue``, ``description``,
        ``significance`` and, when corrected, ``uncorrected_pvalue``).  For the
        'bonferroni' method the tuple ``(list, Bonferroni corrected alpha)`` is returned;
        for 'sidak' and 'holm-sidak' the tuple ``(list, Sidak corrected alpha)``.

    .. note:: multi_tests_cor_method parameter accepts the following values:

        - **bonferroni** : one-step correction
        - **sidak** : one-step correction
        - **holm-sidak** : step down method using Sidak adjustments
        - **holm** : step-down method using Bonferroni adjustments
        - **simes-hochberg** : step-up method  (independent)
        - **hommel** : closed method based on Simes tests (non-negative)
        - **fdr_bh** : Benjamini/Hochberg  (non-negative)
        - **fdr_by** : Benjamini/Yekutieli (negative)
        - **fdr_tsbh** : two stage fdr correction (non-negative)
        - **fdr_tsbky** : two stage fdr correction (non-negative)
    """
    pvals = []
    for val in vals:
        p, d, s = pvalue(vect, val, side=side, significance_threshold=significance_threshold)
        pvals.append(dict(pvalue=p, description=d, significance=s))

    # Multiple testing correction
    if multi_tests_cor_method is not None:
        s, cp, alphacSidak, alphacBonf = multipletests(
            [entry["pvalue"] for entry in pvals],
            alpha=significance_threshold,
            method=multi_tests_cor_method,
        )
        for entry, corrected, significant in zip(pvals, cp, s):
            entry["uncorrected_pvalue"] = entry["pvalue"]
            entry["pvalue"] = corrected
            entry["significance"] = significant
            d = entry["description"]
            # keep the text up to and including the last "=", then the new value
            entry["description"] = "%s %.3e" % (d[: d.rindex("=") + 1], corrected)

        if multi_tests_cor_method == "bonferroni":
            # Bonferroni alpha (the Sidak alpha was returned here by mistake)
            return pvals, alphacBonf
        if multi_tests_cor_method == "sidak" or multi_tests_cor_method == "holm-sidak":
            return pvals, alphacSidak

    return pvals
def calculate_dist_pvalue(self, rmzero=True, minvalues=10):
    """Compare interaction probabilities between two conditions per distance.

    For every distance the table records the number and mean of the
    ``prob`` values per sample and per condition, plus a Mann-Whitney U
    p-value between the two conditions and its Benjamini-Hochberg FDR.

    The data come from ``self.extract_dist_prob()``, which must provide the
    columns ``dist``, ``cond``, ``smpl`` and ``prob``.  ``self.matrices``
    maps each condition to its samples and must contain exactly two
    conditions.

    :param rmzero: drop rows whose ``prob`` is not greater than zero
    :param minvalues: minimum number of values required in both conditions
        for a p-value to be computed
    :return: DataFrame indexed by distance
    """
    # Extract distances for input matrices
    distProb = self.extract_dist_prob()
    splitDist = distProb.groupby('dist')
    # Create output columns
    colNames = []
    for condition in self.matrices:
        for sample in self.matrices[condition]:
            colNames.append('{}_{}_no'.format(condition, sample))
            colNames.append('{}_{}_mean'.format(condition, sample))
    for condition in self.matrices:
        colNames.append('{}_no'.format(condition))
        colNames.append('{}_mean'.format(condition))
    colNames.extend(['pvalue', 'fdr'])
    # Create output dataframe (explicit list rather than a dict view)
    outDF = pd.DataFrame(
        columns=colNames, index=list(splitDist.groups.keys()))
    outDF = outDF.sort_index()
    # Loop through data and calculate results
    for dist, data in splitDist:
        # Remove zero values
        if rmzero:
            data = data[data['prob'] > 0]
        # Extract data for conditions and samples
        condValues = []
        for cond in self.matrices:
            # Extract data for condition
            condData = data[data['cond'] == cond]
            condProb = condData['prob']
            condValues.append(condProb)
            # Add condition data to output
            colPrefix = '{}_'.format(cond)
            outDF.loc[dist, colPrefix + 'no'] = condProb.size
            outDF.loc[dist, colPrefix + 'mean'] = condProb.mean()
            for smpl in self.matrices[cond]:
                # Extract data for sample
                smplData = condData[condData['smpl'] == smpl]
                smplProb = smplData['prob']
                # Add sample data to output
                colPrefix = '{}_{}_'.format(cond, smpl)
                outDF.loc[dist, colPrefix + 'no'] = smplProb.size
                outDF.loc[dist, colPrefix + 'mean'] = smplProb.mean()
        # Calculate pvalues
        prob1, prob2 = condValues
        if prob1.size >= minvalues and prob2.size >= minvalues:
            outDF.loc[dist, 'pvalue'] = mannwhitneyu(prob1, prob2)[1]
    # Add fdr and return.  Skip the correction when no p-value could be
    # computed: multipletests fails on an empty vector.
    pvalueIndex = outDF.index[~outDF['pvalue'].isnull()]
    if len(pvalueIndex) > 0:
        outDF.loc[pvalueIndex, 'fdr'] = multipletests(
            outDF.loc[pvalueIndex, 'pvalue'], method='fdr_bh')[1]
    return(outDF)
def t_test(X, group):
    """
    Simple two-group comparison with (unpaired) t-test.

    :param X: DataFrame, one test per row; columns are split into two
        groups by ``group``
    :param group: boolean Series/array over the columns of ``X`` (it is
        used as a mask and negated with ``~``)
    :return: DataFrame indexed like ``X`` with columns ``logFC``, ``t``,
        ``p`` and ``FDR`` (Benjamini-Hochberg)
    """
    R = pd.DataFrame(index=X.index)
    R["logFC"] = fold_change(X, group, log=2)
    R["logFC"] = R["logFC"].fillna(0)
    # .values instead of as_matrix(), which no longer exists in pandas
    Xm = X.values
    ix = group.values
    t, p = ttest_ind(Xm[:, ix], Xm[:, ~ix], axis=1)
    R["t"] = t
    R["p"] = p
    R["FDR"] = multipletests(R["p"], method="fdr_bh")[1]
    return R
def set_fisher(sets1, sets2, allgenes = None):
    """Fisher's exact test for every pair of sets from two collections.

    :param sets1: dict mapping a name to an iterable of genes
    :param sets2: dict mapping a name to an iterable of genes
    :param allgenes: reference universe; defaults to the union of all sets.
        Every set is intersected with it before testing.
    :return: DataFrame with one row per (s1, s2) pair holding the 2x2 table
        counts ``a``..``d``, set sizes, ``oddsratio``, ``pval`` and the
        Benjamini-Hochberg / Bonferroni adjusted p-values
    """
    if allgenes is None:
        allgenes = set()
        for members in sets1.values():
            allgenes |= set(members)
        for members in sets2.values():
            allgenes |= set(members)
    else:
        allgenes = set(allgenes)
    n_reference = len(allgenes)

    # Restrict the second collection to the universe once, not per pair.
    restricted2 = [(k2, set(s2) & allgenes) for k2, s2 in sets2.items()]

    rv = []
    for k1, s1 in sets1.items():
        s1 = set(s1) & allgenes
        for k2, s2 in restricted2:
            a = s1 & s2
            b = s1 - a
            c = s2 - a
            # genes in neither set; both sets are subsets of the universe
            n_d = n_reference - len(s1 | s2)
            oddsratio, pval = fisher_exact(
                [[len(a), len(b)],
                 [len(c), n_d]],
                alternative='two-sided')
            rv.append(pd.Series(dict(
                a=len(a), b=len(b), c=len(c), d=n_d,
                len1 = len(s1), s1=k1,
                len2=len(s2), s2=k2,
                reference = n_reference,
                oddsratio = oddsratio,
                pval=pval
            )))
    rv = pd.DataFrame(rv)
    rv['padj_bh'] = multipletests(rv['pval'], method='fdr_bh')[1]
    rv['padj_bonf'] = multipletests(rv['pval'], method='bonferroni')[1]
    return rv
def filterExptsByPseudoCountDistr( ddict ):
    """Flag experiments whose pseudocount is high relative to the others.

    The log10 ``'PSEUDO'`` values are centred on their median and scaled by
    ``mad``; the upper-tail normal p-values are then passed to
    ``multipletests`` at alpha 0.05.

    :param ddict: dict mapping experiment key to a dict with a ``'PSEUDO'`` entry
    :return: dict mapping each experiment key to its rejection flag
    """
    expt_keys = list(ddict)
    log_pseudo = np.log10([ddict[key]['PSEUDO'] for key in expt_keys])

    spread = mad(log_pseudo)
    centre = np.percentile(log_pseudo, 50)
    scaled = (log_pseudo - centre) / spread
    upper_tail = 1 - norm.cdf(scaled)

    rejected = multipletests(upper_tail, alpha=0.05)[0]

    # return data in a dictionary
    return {key: flag for key, flag in zip(expt_keys, rejected)}
def fdr_qvals(obs, mc):
    '''
    Compute empirical p-values and FDR-correct them.

    Each p-value is (number of ``mc`` values >= observation, plus one)
    divided by (number of ``mc`` values, plus one).  Note that the results
    follow ``obs`` sorted in descending order, not the input order.

    :param obs: observed values
    :param mc: null-distribution values
    :return: dict with ``pvals`` and Benjamini-Hochberg ``qvals``
    '''
    ranked_obs = np.sort(obs)[::-1]
    null = np.sort(mc)
    n_null = len(null)

    pvals = []
    for value in ranked_obs:
        n_greater_equal = n_null - bisect_left(null, value)
        pvals.append(float(n_greater_equal + 1) / (n_null + 1))

    qvals = list(multipletests(pvals, method='fdr_bh')[1])
    return dict(pvals=pvals, qvals=qvals)
def row_wise_anova(mat, categories, method='fdr_bh'):
    '''
    Apply one-way ANOVA to each row of mat, and adjust p-values.

    :param mat: 2-D array, one test per row
    :param categories: group label for every column of ``mat``
    :param method: correction method passed to ``multipletests``
    :return: tuple ``(pvals, qvals)``
    '''
    group_masks = [np.in1d(categories, [label])
                   for label in np.unique(categories)]
    n_rows = mat.shape[0]
    pvals = np.ones(n_rows, dtype=float)
    for i in range(n_rows):
        values = mat[i]
        samples = [values[group] for group in group_masks]
        _fval, pvals[i] = f_oneway(*samples)
    qvals = multipletests(pvals, method=method)[1]
    return pvals, qvals
def GLM (file, score, stat, ind_var, Level, betas=1):
    """Fit an OLS model of ``score`` on ``ind_var`` and save a summary CSV.

    Rows with a null ``score`` are dropped first.  The one-row summary is
    written to ``<stat>/<score>/<score>_<stat>_<Level>.csv``.  A plot of
    the t-values is drawn and closed again without being saved.

    NOTE(review): the summary columns are called "bonferroni" but the
    correction is done with method 'fdr_bh' - confirm which is intended.

    :param file: path of the input CSV
    :param score: name of the dependent-variable column
    :param stat: label used in the output path and the plot
    :param ind_var: list of independent-variable column names
    :param Level: label used in the output file name
    :param betas: number of leading model parameters to return
    :return: tuple ``(summary frame, filtered input frame,
        first ``betas`` parameters, raw p-values)``
    """
    summary_columns = ['Score', 'stat', 'beta', 'tvalue', 'pvalue',
                       'pval_bonferroni', 'signi_bonferonni', 'Rsquare', 'std']
    df_final = pd.DataFrame(columns=summary_columns)

    # Load the data and keep rows where the score is present
    db = pd.read_csv(file)
    db = db[db[score].notnull()]

    # Fit the model
    response = np.array(db[score])
    design = np.array(db[ind_var])
    model = sm.OLS(response, design).fit()

    pvals = model.pvalues
    corrected = multicomp.multipletests(pvals, alpha = 0.05, method = 'fdr_bh')
    reject_flags, adjusted_pvals = corrected[0], corrected[1]

    # Save it into csv file
    df_final.loc[len(df_final)] = [
        score, stat, model.params, model.tvalues, model.pvalues,
        adjusted_pvals, reject_flags, model.rsquared, model.bse,
    ]
    out_name = score + "_" + stat + "_" + Level + ".csv"
    df_final.to_csv(os.path.join(stat, score, out_name))

    betas_component = model.params[0:betas]

    # Plot the t-values per independent variable
    positions = np.array(range(len(ind_var)))
    plt.plot(positions, model.tvalues,
             linestyle="dashed", marker="o", color="green")
    plt.xticks(positions, ind_var)
    plt.ylabel(score + "_" + stat)
    plt.xlabel("Rsquare %s" % (model.rsquared))
    plt.close()

    return df_final, db, betas_component , pvals
def parse_hwe(f, alpha, vcf_file):
    """
    Parses a hardy-weinberg output file, corrects p-values according to a
    FDR (Benjamini-Hochberg) and writes a filtered copy of the VCF file.

    The output is written to ``<vcf_file up to its first '.'>_filtered.vcf``.
    Header lines (starting with ``#``) are copied unchanged; a record is
    kept when the adjusted p-value of its locus is ``<= 0.05``.

    NOTE(review): the record filter uses a fixed 0.05 rather than ``alpha``
    and keeps the *low* p-values - confirm that both are intended.

    :param f: path of the HWE file; after one header line each row holds
        chromosome and position in fields 0 and 1 and the p-value in field 5
    :param alpha: FDR level passed to the correction (converted with float)
    :param vcf_file: path of the VCF file to filter
    :raises KeyError: if a VCF record's locus is missing from the HWE file
    """
    vcf_outfile = vcf_file.split(".")[0] + "_filtered.vcf"

    snp_pos = []
    pvals = []
    with open(f) as fh:
        # Skip header
        next(fh, None)
        for line in fh:
            fields = line.strip().split()
            snp_pos.append((fields[0], fields[1]))
            pvals.append(float(fields[5]))

    fdr_bool_list, fdr_pvalue_list, alpha_S, alpha_B = \
        multi_correction.multipletests(pvals, alpha=float(alpha),
                                       method="fdr_bh")

    # locus key "<chrom>-<pos>" -> adjusted p-value
    snp_pvals = OrderedDict()
    for pos, pval in zip(snp_pos, fdr_pvalue_list):
        snp_pvals["-".join(pos)] = pval

    with open(vcf_file) as vcf_fh, open(vcf_outfile, "w") as ofh:
        # iterate over the open handle (not over the characters of the path)
        for line in vcf_fh:
            if line.startswith("#"):
                ofh.write(line)
            elif line.strip() != "":
                fields = line.split()
                # Check pval for locus; str.join takes a single iterable
                pos = "-".join((fields[0], fields[1]))
                if snp_pvals[pos] <= 0.05:
                    ofh.write(line)
def lr_tests(sample_info, expression_matrix, full_model, reduced_model='expression ~ 1'):
    """Likelihood-ratio test of two OLS models for every gene.

    For each row (gene) of ``expression_matrix`` the expression values are
    attached to a copy of ``sample_info`` as column ``expression`` and both
    formulas are fitted with ``smf.ols``.

    :param sample_info: DataFrame with the covariates used in the formulas
    :param expression_matrix: DataFrame with one row per gene; must be non-empty
    :param full_model: formula of the full model
    :param reduced_model: formula of the reduced model
    :return: DataFrame indexed by gene with the ``full ...`` and
        ``reduced ...`` parameters, ``pval`` from the likelihood-ratio test
        and ``qval`` (``multipletests`` with method ``'b'``, i.e. Bonferroni)
    """
    tmp = sample_info.copy()
    fit_results = pd.DataFrame(index=expression_matrix.index)
    genes = expression_matrix.index

    def _fit_gene(gene):
        # Fit both models for one gene; .loc replaces the removed .ix
        tmp['expression'] = expression_matrix.loc[gene]
        m1 = smf.ols(full_model, tmp).fit()
        m2 = smf.ols(reduced_model, tmp).fit()
        return (m1.params.add_prefix('full '),
                m2.params.add_prefix('reduced '),
                m1.compare_lr_test(m2)[1])

    def _store(gene, full_params, reduced_params, pval):
        # Write one gene's results into its row of the output table
        fit_results.loc[gene, full_params.index] = full_params
        fit_results.loc[gene, reduced_params.index] = reduced_params
        fit_results.loc[gene, 'pval'] = pval

    # The first gene determines the parameter columns of the output.
    first = _fit_gene(genes[0])
    for column in list(first[0].index) + list(first[1].index) + ['pval']:
        fit_results[column] = np.nan
    _store(genes[0], *first)

    for gene in tqdm(genes[1:]):
        _store(gene, *_fit_gene(gene))

    fit_results['qval'] = multipletests(fit_results['pval'], method='b')[1]
    return fit_results
def get_state(statename, winner, d):
    '''Return table with stat significant for each state.

    :param statename: state to analyse
    :param winner: DataFrame with ``state`` and ``price`` columns
    :param d: aggregated DataFrame with ``state``, ``county`` and
        ``price`` -> ``count`` / ``mean`` columns
    :return: the selected rows of ``d`` with ``pval``, ``reject_naive`` and,
        when the corrections succeed, ``reject_bc``, ``reject_fdr`` and
        ``pval_fdr``; None when the state has fewer than 3 rows in ``winner``
    '''
    state_rows = winner[winner['state'] == statename]
    if len(state_rows) < 3:
        return
    # get average post price and standard deviation
    state_prices = state_rows['price']
    state_mean = state_prices.mean()
    state_sd = state_prices.std()
    # get sample tests that have size larger than 20 and average post price
    # larger than state mean; 20 is to make sure it's ok to perform a
    # hypothesis test.  Work on a copy because columns are added below.
    df = d[(d['state'] == statename)
           & (d['price']['count'] > 20)
           & (d['price']['mean'] > state_mean)].copy()

    # If the distribution of all state price is normal, then we can use z
    # test. If not, use a non-parametric hypothesis testing.  If the
    # population size is larger than 5000, assume normal; if not, run
    # stats.shapiro to test normality.
    assume_normal = True
    if len(state_rows) < 5000:
        stat, pval = stats.shapiro(state_prices)
        assume_normal = pval > 0.05

    if assume_normal:
        # z_test is expected to add the "pval" column to df in place
        z_test(state_mean, state_sd, df)
    else:
        stat_list = []
        df.apply(lambda row: go_nonpar(row['county'].values[0],
                                       row['state'].values[0],
                                       stat_list, state_mean, winner),
                 axis=1)
        df["pval"] = stat_list

    alpha = 0.05
    df["reject_naive"] = 1*(df["pval"] < alpha)
    try:
        df["reject_bc"] = 1*(df["pval"] < alpha / len(df))
        is_reject, corrected_pvals, _, _ = multipletests(df["pval"], alpha=0.1, method='fdr_bh')
        df["reject_fdr"] = 1*is_reject
        df["pval_fdr"] = corrected_pvals
    except Exception:
        # best effort: e.g. no rows were selected, so the corrections cannot
        # be computed; the table is returned without these columns
        pass
    return df
def test_lee_et_al(n=300,
                   p=100,
                   s=10,
                   signal=3.5,
                   rho=0.,
                   sigma=1.,
                   cross_validation=True,
                   condition_on_CVR=False,
                   lam_frac=0.6,
                   glmnet=True,
                   X=None,
                   check_screen=True,
                   intervals=False):
    """Fit a Gaussian lasso and compute per-coefficient selective p-values.

    Data come from ``gaussian_instance`` when ``X`` is None; otherwise ``y``
    is simulated from ``X`` with the first ``s`` coefficients set to
    ``signal``.  The penalty ``lam`` is chosen through ``CV_view`` when
    ``cross_validation`` is true, else from ``lam_frac``.

    Returns None when nothing is selected, when ``check_screen`` is true and
    the active set misses a true variable, when the fit has no constraints,
    or when ``intervals`` is true and a p-value falls below 1e-8.  Otherwise
    returns the tuple ``(pvalues, sel_covered, sel_length, naive_pvalues,
    naive_covered, naive_length, active_var, BH_desicions)``.

    ``rho`` is accepted but not used in this function.
    """
    print(n, p, s)

    if X is None:
        X, y, beta, truth, sigma = gaussian_instance(n=n, p=p, s=s,
                                                     signal=signal,
                                                     sigma=sigma,
                                                     scale=True, center=True)
    else:
        beta = np.zeros(p)
        beta[:s] = signal
        y = X.dot(beta) + np.random.standard_normal(n) * sigma

    truth = np.nonzero(beta != 0)[0]

    if cross_validation:
        cv = CV_view(rr.glm.gaussian(X, y), loss_label="gaussian",
                     lasso_randomization=None, epsilon=None,
                     scale1=None, scale2=None)
        # views.append(cv)
        cv.solve(glmnet=glmnet and have_glmnet)
        lam = cv.lam_CVR
        print("minimizer of CVR", lam)

        if condition_on_CVR:
            cv.condition_on_opt_state()
            lam = np.true_divide(lam + cv.one_SD_rule(direction="up"), 2)
            #lam = cv.one_SD_rule(direction="up")
            print("one SD rule lambda", lam)
    else:
        lam = lam_frac * np.fabs(
            X.T.dot(np.random.normal(1, 1. / 2, (n, 1000)))).max()

    L = lasso.gaussian(X, y, lam, sigma=sigma)
    soln = L.fit()

    active = soln != 0
    nactive = active.sum()
    print("nactive", nactive)
    if nactive == 0:
        return None

    active_signs = np.sign(soln[active])

    if (check_screen == False) or (set(truth).issubset(np.nonzero(active)[0])):

        active_set = np.nonzero(active)[0]
        print("active set", active_set)
        true_vec = beta[active]
        # ``np.bool`` was removed in NumPy 1.24; the builtin is equivalent.
        active_var = np.zeros(nactive, bool)

        # Lee et al. using sigma
        pvalues = np.zeros(nactive)
        sel_length = np.zeros(nactive)
        sel_covered = np.zeros(nactive)

        naive_pvalues = np.zeros(nactive)
        naive_length = np.zeros(nactive)
        naive_covered = np.zeros(nactive)

        C = L.constraints

        if C is not None:
            one_step = L.onestep_estimator
            for i in range(one_step.shape[0]):
                eta = np.zeros_like(one_step)
                eta[i] = active_signs[i]
                alpha = 0.1

                def naive_inference():
                    obs = (eta * one_step).sum()
                    sd = np.sqrt(np.dot(eta.T, C.covariance.dot(eta)))
                    # NOTE(review): Z is computed but the p-value below
                    # uses obs / sigma -- confirm which is intended.
                    Z = obs / sd
                    # use Phi truncated to [-5,5]
                    _pval = ndist.cdf(obs / sigma)
                    _pval = 2 * min(_pval, 1 - _pval)
                    _interval = (obs - ndist.ppf(1 - alpha / 2) * sd,
                                 obs + ndist.ppf(1 - alpha / 2) * sd)
                    return _pval, _interval

                if C.linear_part.shape[0] > 0:  # there were some constraints
                    # NOTE: from here on ``L`` is the lower bound, not the
                    # lasso object fitted above.
                    L, Z, U, S = C.bounds(eta, one_step)
                    _pval = pivot(L, Z, U, S)
                    # two-sided
                    _pval = 2 * min(_pval, 1 - _pval)

                    if intervals == True:
                        if _pval < 10**(-8):
                            return None
                        L, Z, U, S = C.bounds(eta, one_step)
                        _interval = equal_tailed_interval(L, Z, U, S,
                                                          alpha=alpha)
                        _interval = sorted([
                            _interval[0] * active_signs[i],
                            _interval[1] * active_signs[i]
                        ])
                else:
                    obs = (eta * one_step).sum()
                    ## jelena: should be this sd = np.sqrt(np.dot(eta.T, C.covariance.dot(eta))), no?
                    sd = np.sqrt((eta * C.covariance.dot(eta)))
                    Z = obs / sd
                    _pval = 2 * (ndist.sf(min(np.fabs(Z))) - ndist.sf(5)) / (ndist.cdf(5) - ndist.cdf(-5))
                    if intervals == True:
                        _interval = (obs - ndist.ppf(1 - alpha / 2) * sd,
                                     obs + ndist.ppf(1 - alpha / 2) * sd)

                pvalues[i] = _pval

                naive_pvalues[i], _naive_interval = naive_inference()

                #print(_naive_interval)

                def coverage(LU):
                    L, U = LU[0], LU[1]
                    _length = U - L
                    _covered = 0
                    if (L <= true_vec[i]) and (U >= true_vec[i]):
                        _covered = 1
                    return _covered, _length

                if intervals == True:
                    sel_covered[i], sel_length[i] = coverage(_interval)
                    naive_covered[i], naive_length[i] = coverage(
                        _naive_interval)

                active_var[i] = active_set[i] in truth
        else:
            return None

        print(pvalues)
        q = 0.2
        BH_desicions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
        return pvalues, sel_covered, sel_length, \
            naive_pvalues, naive_covered, naive_length, active_var, BH_desicions
def get_adjusted_pvals(df):
    """Add an 'Adjusted P-value' column (Benjamini-Hochberg) to ``df``.

    The values of the 'P-value' column are passed to ``multipletests`` with
    alpha 0.1 and method 'fdr_bh'; the corrected p-values are stored on
    ``df`` itself, which is also returned.
    """
    raw_pvalues = df['P-value'].tolist()
    _, corrected, _, _ = multipletests(raw_pvalues, 0.1, method='fdr_bh')
    df['Adjusted P-value'] = corrected
    return df
def FunctionalExamination(client,
                          SL_or_SDL,
                          database,
                          input_genes,
                          percentile_threshold,
                          cn_threshold,
                          adj_method,
                          fdr_level,
                          tissues,
                          input_mutations=None):
    '''
    Description: Gene expression, Copy Number Alteration (CNA) and
    (optionally) somatic mutations are used to decide whether a gene is
    inactive. SL/SDL pairs are detected from the difference in gene
    effect/dependency score between samples where the gene is inactive and
    samples where it is not.

    Inputs:
        client: BigQuery client that will run the query.
        SL_or_SDL: string, Synthetic Lethal or Synthetic Dosage Lethal,
            valid values: 'SL', 'SDL'
        database: string, the data resource the analysis is performed on,
            valid values: "CRISPR", "shRNA"
        input_genes: list of strings, the genes whose SL/SDL partners are
            sought
        percentile_threshold: double, gene-expression threshold in percent
            (divided by 100 before it is compared with PERCENT_RANK)
        cn_threshold: double, copy-number threshold; transformed with
            log2(2**cn_threshold + 1) before use
        adj_method: string, p-value correction method passed to
            multipletests, e.g. bonferroni, sidak, holm-sidak, holm,
            simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky
        fdr_level: string, scope of the p-value adjustment,
            valid values: "gene_level", "analysis_level"
        tissues: list of strings, the tissues the analysis is performed on
        input_mutations: list of strings, optional, valid values:
            "Missense_Mutation", "Nonsense_Mutation",
            "Translation_Start_Site", "Frame_Shift_Ins", "Splice_Site",
            "In_Frame_Del", "Frame_Shift_Del", "Nonstop_Mutation",
            "In_Frame_Ins". Ignored when SL_or_SDL is 'SDL'.

    Output:
        A dataframe of SL/SDL pairs. An empty tuple is returned (after
        printing a message) when database, SL_or_SDL or fdr_level is
        invalid or when there are too few samples; the empty query result
        is returned when no candidate pair is found.
    '''
    if database not in ('CRISPR', 'shRNA'):
        print("The database name can be either CRISPR or shRNA")
        return ()
    # Previously an invalid value only failed later with a NameError on
    # comp_str; report it the same way as the other invalid arguments.
    if SL_or_SDL not in ('SL', 'SDL'):
        print("SL_or_SDL can be either SL or SDL")
        return ()

    if database == 'CRISPR':
        dep_score_table = 'isb-cgc-bq.DEPMAP.Achilles_gene_effect_DepMapPublic_current'
        sample_id = 'DepMap_ID'
        gene_exp = 'TPM'
        effect = 'Gene_Effect'
        symbol = 'Hugo_Symbol'
        selected_samples = RetrieveSamples(client, 'CRISPR', 'func_ex', tissues)
        ccle_samples = selected_samples
        ccle_sample_id = 'DepMap_ID'
        cid = "DepMap_ID"
    else:  # shRNA
        dep_score_table = 'isb-cgc-bq.DEPMAP.Combined_gene_dep_score_DEMETER2_current'
        sample_id = 'CCLE_ID'
        gene_exp = 'TPM'
        effect = 'Combined_Gene_Dep_Score'
        symbol = 'Hugo_Symbol'
        selected_samples = RetrieveSamples(client, 'shRNA', 'func_ex', tissues)
        ccle_samples = selected_samples['DepMap_ID']
        shRNA_samples = selected_samples['CCLE_Name']
        ccle_sample_id = 'DepMap_ID'
        cid = "CCLE_Name"

    mutation_table = 'isb-cgc-bq.DEPMAP.CCLE_mutation_DepMapPublic_current'
    gene_exp_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_expression_DepMapPublic_current'
    cn_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_cn_DepMapPublic_current'
    sample_info_table = 'isb-cgc-bq.synthetic_lethality.sample_info_TCGAlabels_DepMapPublic_20Q3'

    cn_threshold = np.log2(2**(cn_threshold) + 1)
    gene_mapping = ProcessGeneAlias(client, input_genes, 'DepMap')

    min_sample_size = 20
    if len(selected_samples) < (min_sample_size + 1):
        print("Sample size needs to be greater than " + str(min_sample_size) +
              ", it is " + str(len(selected_samples)))
        return ()

    # table1: (gene, sample) pairs where the gene passes BOTH the
    # expression-percentile and the copy-number comparison.  The closing
    # parenthesis of the CTE is appended by the caller-side concatenation.
    sql_without_mutation = """
    WITH
    table1 AS (
    (SELECT symbol, Barcode FROM
        (SELECT GE.__SYMBOL__ AS symbol, GE.__CCLE_SAMPLE_ID__ AS Barcode ,
            PERCENT_RANK () over (partition by __SYMBOL__ order by __GENE_EXPRESSION__ asc) AS Percentile
        FROM __GENE_EXP_TABLE__ GE
        WHERE GE.__SYMBOL__ in (__GENELIST__) AND __CCLE_SAMPLE_ID__ in (__SAMPLE_LIST_CCLE__)
            AND __GENE_EXPRESSION__ is not null
        ) AS NGE
    WHERE NGE.Percentile __GENE_CMP_STR__
    INTERSECT DISTINCT
    SELECT symbol, Barcode FROM
        (SELECT CN.__SYMBOL__ AS symbol, CN.__CCLE_SAMPLE_ID__ AS Barcode, CN.CNA AS NORM_CN
        FROM __CN_TABLE__ CN
        WHERE CN.__SYMBOL__ in (__GENELIST__) AND __CCLE_SAMPLE_ID__ in (__SAMPLE_LIST_CCLE__)
            and CN.CNA is not null) AS NC
    WHERE NC.NORM_CN __CN_CMP_STR__
    )"""

    # Optional extension of table1 with samples carrying selected mutations.
    sql_mutation_part = """
    UNION DISTINCT
    SELECT M.__SYMBOL__ AS symbol , M.__CCLE_SAMPLE_ID__ AS Barcode
    FROM __MUTATION_TABLE__ M
    WHERE __SYMBOL__ IN (__GENELIST__) AND M.Variant_Classification IN (__MUTATIONLIST__)
        AND __CCLE_SAMPLE_ID__ in (__SAMPLE_LIST_CCLE__))"""

    # table2 ranks the effect scores per gene; the remaining CTEs derive a
    # rank-sum U statistic and z-score per (symbol1, symbol2) pair.
    # The disabled HAVING clause is kept on its own line: '#' starts a
    # comment that runs to the end of the line, so it must not share a
    # line with ORDER BY.
    rest_of_the_query = """
    , table2 AS (
    SELECT S.DepMap_ID Barcode, __SYMBOL__ symbol,
        (RANK() OVER (PARTITION BY __SYMBOL__ ORDER BY __EFFECT__ ASC))
        + (COUNT(*) OVER ( PARTITION BY __SYMBOL__, CAST(__EFFECT__ as STRING)) - 1)/2.0 AS rnkdata
    FROM __ACHILLES_TABLE__ A, __SAMPLE_INFO_TABLE__ S
    where __SYMBOL__ IS NOT NULL AND __EFFECT__ IS NOT NULL
        AND S.__REL_SAMPLE_ID__=A.__SAMPLE_ID__ AND S.DepMap_ID in (__SAMPLE_LIST_CCLE__)
    ),
    summ_table AS (
    SELECT n1.symbol as symbol1, n2.symbol as symbol2,
        COUNT( n1.Barcode) as n_1,
        SUM( n2.rnkdata ) as sumx_1,
    FROM table1 AS n1
    INNER JOIN table2 AS n2
    ON n1.Barcode = n2.Barcode
    GROUP BY symbol1, symbol2
    ),
    statistics AS (
    SELECT symbol1, symbol2, n1, n, U1,
        (U1 - n1n2/2.0)/den as zscore
    FROM (
        SELECT symbol1, symbol2, n_t as n, n_1 as n1,
            sumx_1 - n_1 *(n_1 + 1) / 2.0 as U1,
            n_1 * (n_t - n_1 ) as n1n2,
            SQRT( n_1 * (n_t - n_1 )*(n_t + 1) / 12.0 ) as den
        FROM summ_table as t1
        LEFT JOIN ( SELECT symbol, COUNT( Barcode ) as n_t
                    FROM table2
                    GROUP BY symbol) t2
        ON symbol2 = symbol
        WHERE n_t > 20 and n_1>5
    )
    WHERE den > 0
    )
    SELECT symbol1, symbol2, n1, n, U1,
        `cgc-05-0042.functions.jstat_normal_cdf`(zscore, 0.0, 1.0 ) as pvalue
    FROM statistics
    GROUP BY 1,2,3,4,5,6
    #HAVING pvalue <= 0.01
    ORDER BY pvalue ASC
    """

    input_genes_query = ','.join("'" + str(x) + "'" for x in input_genes)
    included_samples = ','.join("'" + str(x) + "'" for x in selected_samples)
    included_samples_ccle = ','.join("'" + str(x) + "'" for x in ccle_samples)

    if SL_or_SDL == 'SDL' or input_mutations is None:
        sql_func_ex = sql_without_mutation + ')' + ' ' + rest_of_the_query
    else:
        input_mutations_for_query = ','.join(
            "'" + x + "'" for x in input_mutations)
        sql_func_ex = (sql_without_mutation + ' ' + sql_mutation_part + ' ' +
                       rest_of_the_query)
        sql_func_ex = sql_func_ex.replace('__MUTATION_TABLE__', mutation_table)
        sql_func_ex = sql_func_ex.replace('__MUTATIONLIST__',
                                          input_mutations_for_query)

    if SL_or_SDL == "SL":
        comp_str = "<" + str(cn_threshold)
        com_gene_th = "<" + str(percentile_threshold / 100)
    else:  # SDL
        comp_str = ">" + str(cn_threshold)
        com_gene_th = ">" + str(percentile_threshold / 100)

    # Placeholder substitutions, applied in the original order.
    substitutions = [
        ('__GENELIST__', input_genes_query),
        ('__CUTOFFPRC__', str(percentile_threshold / 100)),
        ('__CUTOFFSCNA__', str(cn_threshold)),
        ('__CN_TABLE__', cn_table),
        ('__GENE_EXP_TABLE__', gene_exp_table),
        ('__SAMPLE_ID__', sample_id),
        ('__SYMBOL__', symbol),
        ('__ACHILLES_TABLE__', dep_score_table),
        ('__GENE_EXPRESSION__', gene_exp),
        ('__EFFECT__', effect),
        ('__SAMPLE_LIST__', included_samples),
        ('__SAMPLE_LIST_CCLE__', included_samples_ccle),
        ('__SAMPLE_INFO_TABLE__', sample_info_table),
        ('__CCLE_SAMPLE_ID__', ccle_sample_id),
        ('__REL_SAMPLE_ID__', cid),
        ('__CN_CMP_STR__', comp_str),
        ('__GENE_CMP_STR__', com_gene_th),
    ]
    for placeholder, value in substitutions:
        sql_func_ex = sql_func_ex.replace(placeholder, value)

    results = client.query(sql_func_ex).result().to_dataframe()
    if results.shape[0] < 1:
        print("Functional examimation inference procedure applied on " +
              database + " did not find candidate " + SL_or_SDL + " pairs.")
        return (results)

    report = results[['symbol1', 'symbol2', 'n1', 'n', 'pvalue']]
    report = report.dropna()
    report.columns = [
        'InactiveDB', 'SL_Candidate', '#InactiveSamples', '#Samples', 'PValue'
    ]
    report['Inactive'] = report['InactiveDB'].map(gene_mapping)

    if fdr_level == "gene_level":
        # Adjust separately within the rows of each inactive gene.
        for gene in report["Inactive"].unique():
            gene_rows = report["Inactive"] == gene
            report.loc[gene_rows, 'FDR'] = multipletests(
                report.loc[gene_rows, 'PValue'],
                method=adj_method,
                is_sorted=False)[1]
    elif fdr_level == "analysis_level":
        report['FDR'] = multipletests(report['PValue'],
                                      method=adj_method,
                                      is_sorted=False)[1]
    else:
        print("FDR level can be either gene_level or analysis_level")
        return ()

    report['Tissue'] = str(tissues)
    cols = [
        'Inactive', 'InactiveDB', 'SL_Candidate', '#InactiveSamples',
        '#Samples', 'PValue', 'FDR', 'Tissue'
    ]
    report = report[cols]
    if SL_or_SDL == "SDL":
        report.columns = [
            'Overactive', 'OveractiveDB', 'SL_Candidate', '#Overactive',
            '#Samples', 'PValue', 'FDR', 'Tissue'
        ]
    return report
def run(de, all, organism='hsa', nB=2000, beta=None, combine='fisher'):
    """Score every pathway for over-representation and perturbation.

    Args:
        de: mapping of gene id -> value for the differentially expressed
            genes, or a string that is handed to ``SPIA._load_de`` together
            with ``all``.
        all: collection of all gene ids considered (the reference set).
        organism: looked up in ``IdMapping.SPECIES``; only entries that
            resolve to 'hsa' or 'mmu' are accepted.
        nB: number of random resamplings used for the perturbation p-value.
        beta: accepted and forwarded to ``SPIA``.
            NOTE(review): the original expression was ``[...] or beta``; a
            non-empty list is always truthy, so ``beta`` never affected the
            relation weights.  That behaviour is kept here -- confirm
            whether ``beta`` was meant to override the defaults.
        combine: 'fisher' combines pNDE and pPERT as ``c - c*log(c)`` with
            ``c = pNDE * pPERT``; any other value uses the normal-inversion
            formula.

    Returns:
        ``SPIA(df, de, all, organism, nB, beta, combine)`` where ``df`` has
        the columns name, pNDE, pPERT, pG, pGfdr, pGFWER and status, sorted
        by pGFWER.

    Raises:
        Exception: unknown organism, or an organism other than hsa/mmu.
    """
    for x in IdMapping.SPECIES:
        if organism in x:
            organism = x[3]
            break
    else:
        raise Exception("Unknown organism")
    if organism not in ['hsa', 'mmu']:
        raise Exception("The organism not contained in the prepared data")

    if isinstance(de, str):
        de, all = SPIA._load_de(de, all)
    else:
        de = {int(k): float(v) for k, v in de.items()}
        all = [int(x) for x in all]

    datpT_ALL, id2name = SPIA.load_json_data(organism)

    rel = [
        "activation", "compound", "binding/association", "expression",
        "inhibition", "activation_phosphorylation", "phosphorylation",
        "inhibition_phosphorylation", "inhibition_dephosphorylation",
        "dissociation", "dephosphorylation", "activation_dephosphorylation",
        "state change", "activation_indirect effect",
        "inhibition_ubiquination", "ubiquination",
        "expression_indirect effect", "inhibition_indirect effect",
        "repression", "dissociation_phosphorylation",
        "indirect effect_phosphorylation", "activation_binding/association",
        "indirect effect", "activation_compound", "activation_ubiquination"
    ]
    inter_value = [
        1, 0, 0, 1, -1, 1, 0, -1, -1, 0, 0, 1, 0, 1, -1, 0, 1, -1, -1, 0, 0,
        1, 0, 1, 1
    ]
    rel_dict = dict(zip(rel, inter_value))

    # Build one signed, column-normalised interaction matrix per pathway.
    datp_ALL = {}
    for k, v in datpT_ALL.items():
        sizem = len(v[rel[0]][0])
        s, con = np.zeros((sizem, sizem)), np.zeros((sizem, sizem))
        for kk, vv in rel_dict.items():
            con += v[kk] * abs(vv)
            s += v[kk] * vv
        zz = np.reshape(np.repeat(con.sum(axis=0), sizem), (sizem, sizem))
        z = np.transpose(zz)
        z[z == 0] = -1  # avoid dividing by zero for empty columns
        datp_ALL[k] = np.divide(s, z)

    smPFS, tAraw, tA, pNDE, pb, pG, status = {}, {}, {}, {}, {}, {}, {}

    # Hoisted out of the pathway loop: these do not depend on the pathway.
    all_set = set(all)
    de_keys = set(de.keys())
    de_sample = list(de.values())

    # calculate the Ac
    for k, v in datp_ALL.items():
        row_names = datpT_ALL[k]['row_names']
        row_set = set(row_names)

        # let first calculate the pNDE
        noMy = len(row_set & de_keys)
        pNDE[k] = stats.hypergeom.sf(noMy - 1, len(all),
                                     len(row_set & all_set), len(de))

        # then calculate the Ac and pPERT
        M = np.eye(v.shape[0]) * -1 + v
        if np.linalg.det(M) == 0:
            # Singular system: no perturbation score for this pathway.
            smPFS[k], tAraw[k], tA[k], pb[k] = np.nan, np.nan, np.nan, np.nan
            continue

        X = [de[x] if x in de else 0 for x in row_names]
        pfs = np.linalg.solve(M, -np.array(X))
        smPFS[k] = sum(pfs - X)
        tAraw[k] = smPFS[k]

        # Null distribution: place noMy random DE values on noMy random
        # pathway genes taken from the reference set.
        pfstmp = []
        all_sample = [i for i, x in enumerate(row_names) if x in all_set]
        length = len(X)
        for i in range(nB):
            x = np.zeros(length)
            sp = random.sample(de_sample, noMy)
            idx = random.sample(all_sample, noMy)
            x[idx] = sp
            tt = np.linalg.solve(M, -x)
            pfstmp.append(sum(tt - x))

        pfstmp = np.array(pfstmp)
        null_median = np.median(pfstmp)
        tA[k] = tAraw[k] - null_median
        status[k] = "Activated" if tA[k] > 0 else "Inhibited"

        ob = tA[k]
        pfstmp = pfstmp - null_median
        if ob > 0:
            extreme = np.count_nonzero(pfstmp >= ob)
        elif ob < 0:
            extreme = np.count_nonzero(pfstmp <= ob)
        else:
            extreme = None
        if extreme is None:
            pb[k] = 1
        else:
            # Two-sided p-value, kept inside (0, 1].
            pb[k] = extreme / len(pfstmp) * 2
            if pb[k] <= 0:
                pb[k] = 1 / nB / 100
            elif pb[k] >= 1:
                pb[k] = 1

        if combine == 'fisher':
            c = pNDE[k] * pb[k]
            # The limit of c - c*log(c) as c -> 0 is 0; math.log(0) raises.
            pG[k] = c - c * math.log(c) if c > 0 else 0.0
        else:
            # comb = pnorm((qnorm(p1) + qnorm(p2)) / sqrt(2))
            # The sum, not just the second term, is divided by sqrt(2).
            pG[k] = norm.cdf(
                (norm.ppf(pNDE[k]) + norm.ppf(pb[k])) / math.sqrt(2))
        # print('id: ', k, '\ttA:', tA[k], '\tpNDE: ', pNDE[k], '\t pPERT: ', pb[k], '\tPG: ', pG[k])

    _, o, _, _ = multicomp.multipletests(list(pG.values()), method='fdr_bh')
    pGfdr = dict(zip(pG, o))
    # NOTE(review): pNDEfdr is computed but not part of the output table.
    _, o, _, _ = multicomp.multipletests(list(pNDE.values()),
                                         method='fdr_bh')
    pNDEfdr = dict(zip(pNDE, o))
    _, o, _, _ = multicomp.multipletests(list(pG.values()),
                                         method='bonferroni')
    pGbf = dict(zip(pG, o))

    df = pd.DataFrame([id2name, pNDE, pb, pG, pGfdr, pGbf, status]).T
    df.columns = [
        'name', 'pNDE', 'pPERT', 'pG', 'pGfdr', 'pGFWER', 'status'
    ]
    df = df.sort_values(by='pGFWER')
    return SPIA(df, de, all, organism, nB, beta, combine)
pvals = np.zeros((mdl['X'].shape[1], len(pairs))) # save mean connectivity for each group pair, and their differences output_list = [] for j, p in enumerate(pairs): t1 = int(p.split(',')[0]) t2 = int(p.split(',')[1]) for k in np.arange(mdl['X'].shape[1]): #pvals[k, j] = kruskalwallis(group_data[t1][:, k], group_data[t2][:, k]).pvalue pvals[k, j] = ttest_ind(group_data[t1][:, k], group_data[t2][:, k])[1] # binarize with fdr correction or a uncorrected threshold if fdr: corrected = multipletests(np.ravel(pvals[:, j]), alpha=0.05, method='fdr_bh') passed = corrected[0] pvals_corrected = corrected[1] else: threshold = 1 passed = pvals[:, j] < threshold # skip comparisons with no significant contrasts if len(passed) == 0: print('SKIPPING DUE TO NO SIG. DIFFERENCES') continue try: threshold = np.max(pvals[passed]) print(
df = elem_df.merge(barc_df.drop(samp_drop_cols, axis=1), on=["unique_id", "element"], how="left") df = df.drop_duplicates() all_grp_dfs.append(df) # In[27]: # correct p values all_corr_dfs = [] for df in all_grp_dfs: pval_cols = [x for x in df.columns if "_pval" in x] for col in pval_cols: sub_df = df[~pd.isnull(df[col])][["unique_id", "element", col]] new_pvals = multicomp.multipletests(sub_df[col], method="bonferroni")[1] padj_col = "rna_%s_padj" % (col.split("_")[1]) sub_df[padj_col] = new_pvals sub_df.drop(col, axis=1, inplace=True) df = df.merge(sub_df, on=["unique_id", "element"], how="left") all_corr_dfs.append(df) # ## 4. use stouffer's method to combine p-values across replicates # in this case, combine the *uncorrected* pvalues and *then adjust* using stouffer's method # In[28]: all_names = [ "POOL1__pMPRA1__HeLa", "POOL1__pMPRA1__HepG2", "POOL1__pMPRA1__K562", "POOL1__pNoCMVMPRA1__HeLa", "POOL1__pNoCMVMPRA1__HepG2",
def p_adj_bh(x):
    '''Return the Benjamini/Hochberg-adjusted p-values for ``x``.

    ``multipletests`` is called with ``returnsorted=False``; only its
    second return value (the corrected p-values) is passed on.
    '''
    _, adjusted, _, _ = multipletests(x, method='fdr_bh', returnsorted=False)
    return adjusted
def pcaller(M, cM, biases, IR, chromLen, Diags, cDiags, num, pw=2, ww=5, sig=0.05, maxww=20, maxapart=2000000, res=10000):
    """Call significant contacts in ``M`` against a locally corrected
    expected matrix, using Poisson p-values with Benjamini-Hochberg control.

    Python 2 code (uses ``xrange``).

    Parameters as used below (types are not checked here):
        M: matrix supporting ``.nonzero()``, ``.sum(axis=1)`` and fancy
            indexing ``M[xi, yi]`` -- presumably a scipy sparse matrix of
            observed contacts; confirm against the caller.
        cM: not used in this function.
        biases: indexable by bin; multiplied into the expected values.
        IR: object whose ``predict(x)`` returns the expected value per
            diagonal offset; negative predictions are clipped to 0.
        chromLen: number of bins; the sparse matrices are
            ``chromLen x chromLen``.
        Diags, cDiags: per-offset 1-D diagonals; ``Diags`` is indexed by
            ``0..num-1``, ``cDiags`` by position in ``x = arange(ww, num)``.
        num: number of diagonals considered.
        pw: peak half-width (peak size ``2*pw + 1``).
        ww, maxww: smallest / largest window half-width tried.
        sig: alpha passed to the fdr_bh correction.
        maxapart, res: contacts farther apart than ``maxapart // res`` bins
            are ignored.

    Returns:
        ``(xpos, ypos, Ovalues, Fold, pvalues, qvalues)`` for the contacts
        rejected by the correction, after dropping those within 5 bins of
        a row of ``M`` whose sum is zero.
    """
    # Necessary Modules
    from scipy.stats import poisson
    from statsmodels.sandbox.stats.multicomp import multipletests

    logger = logging.getLogger()

    # Zero-pad every observed diagonal by w on both sides, for each window
    # width w, so shifted slices can be taken without bounds checks.
    extDiags = {}
    for w in range(ww, maxww + 1):
        temp = []
        for i in xrange(num):
            OneDArray = Diags[i]
            extODA = np.zeros(chromLen - i + w * 2)
            extODA[w:-w] = OneDArray
            temp.append(extODA)
        extDiags[w] = temp

    # Expected value per diagonal offset, as a constant diagonal.
    x = np.arange(ww, num)
    predictE = IR.predict(x)
    predictE[predictE < 0] = 0
    EDiags = []
    for i in xrange(x.size):
        OneDArray = np.ones(chromLen - x[i]) * predictE[i]
        EDiags.append(OneDArray)

    EM = sparse.diags(EDiags, x, format='csr')

    # Same zero-padding for the expected and the cDiags diagonals.
    extCDiags = {}
    extEDiags = {}
    for w in range(ww, maxww + 1):
        tempC = []
        tempE = []
        for i in xrange(x.size):
            extODA_E = np.zeros(chromLen - x[i] + w * 2)
            extODA_E[w:-w] = EDiags[i]
            tempE.append(extODA_E)
            extODA_C = np.zeros(chromLen - x[i] + w * 2)
            extODA_C[w:-w] = cDiags[i]
            tempC.append(extODA_C)
        extCDiags[w] = tempC
        extEDiags[w] = tempE

    ps = 2 * pw + 1 # Peak Size

    # For every window width and every (i, j) position inside the window,
    # pre-slice the padded diagonals and record the shifted offsets.
    Pool_Diags = {}
    Pool_EDiags = {}
    Pool_cDiags = {}
    Offsets_Diags = {}
    Offsets_EDiags = {}

    for w in range(ww, maxww + 1):
        ws = 2 * w + 1 # Window size
        ss = range(ws)
        Pool_Diags[w] = {}
        Pool_EDiags[w] = {}
        Pool_cDiags[w] = {}
        Offsets_Diags[w] = {}
        Offsets_EDiags[w] = {}
        for i in ss:
            for j in ss:
                Pool_Diags[w][(i, j)] = []
                Pool_EDiags[w][(i, j)] = []
                Pool_cDiags[w][(i, j)] = []
                Offsets_Diags[w][(i, j)] = np.arange(num) + (i - j)
                Offsets_EDiags[w][(i, j)] = x + (i - j)
                for oi in np.arange(num):
                    if Offsets_Diags[w][(i, j)][oi] >= 0:
                        starti = i
                        endi = i + chromLen - Offsets_Diags[w][(i, j)][oi]
                    else:
                        starti = i - Offsets_Diags[w][(i, j)][oi]
                        endi = starti + chromLen + Offsets_Diags[w][(i, j)][oi]
                    Pool_Diags[w][(i, j)].append(extDiags[w][oi][starti:endi])
                for oi in xrange(x.size):
                    if Offsets_EDiags[w][(i, j)][oi] >= 0:
                        starti = i
                        endi = i + chromLen - Offsets_EDiags[w][(i, j)][oi]
                    else:
                        starti = i - Offsets_EDiags[w][(i, j)][oi]
                        endi = starti + chromLen + Offsets_EDiags[w][(i, j)][oi]
                    Pool_EDiags[w][(i, j)].append(extEDiags[w][oi][starti:endi])
                    Pool_cDiags[w][(i, j)].append(extCDiags[w][oi][starti:endi])

    ## Peak Calling ...
    # Candidate contacts: non-zero entries between ww and maxapart // res
    # bins off the main diagonal.
    xi, yi = M.nonzero()
    Mask = ((yi - xi) >= ww) & ((yi - xi) <= (maxapart // res))
    xi = xi[Mask]
    yi = yi[Mask]
    bSV = np.zeros(xi.size)
    bEV = np.zeros(xi.size)

    logger.info('Observed Contact Number: %d', xi.size)

    # RefIdx holds the candidates still lacking a valid background;
    # it shrinks as the window width grows.
    RefIdx = np.arange(xi.size)
    RefMask = np.ones_like(xi, dtype=bool)

    iniNum = xi.size

    logger.info('Calculate the expected matrix ...')
    for w in range(ww, maxww + 1):
        ws = 2 * w + 1
        bS = sparse.csr_matrix((chromLen, chromLen))
        bE = sparse.csr_matrix((chromLen, chromLen))
        Reads = sparse.csr_matrix((chromLen, chromLen))
        logger.info(' Current window width: %s' % w)
        # P1: window positions covered by the peak itself.
        P1 = set([(i, j) for i in range(w - pw, ps + w - pw)
                  for j in range(w - pw, ps + w - pw)])
        # P2 = P_1 minus P_2: the positions whose reads are counted in Reads.
        P_1 = set([(i, j) for i in range(w + 1, ws) for j in range(w)])
        P_2 = set([(i, j) for i in range(w + 1, ps + w - pw)
                   for j in range(w - pw, w)])
        P2 = P_1 - P_2
        for key in Pool_Diags[w]:
            # Background sums skip the centre row/column and the peak area.
            if (key[0] != w) and (key[1] != w) and (key not in P1):
                bS = bS + sparse.diags(
                    Pool_cDiags[w][key], Offsets_EDiags[w][key], format='csr')
                bE = bE + sparse.diags(
                    Pool_EDiags[w][key], Offsets_EDiags[w][key], format='csr')
            if key in P2:
                Reads = Reads + sparse.diags(
                    Pool_Diags[w][key], Offsets_Diags[w][key], format='csr')

        Txi = xi[RefIdx]
        Tyi = yi[RefIdx]
        RNums = np.array(Reads[Txi, Tyi]).ravel()
        # A candidate is settled at this width once its Reads count is >= 16.
        EIdx = RefIdx[RNums >= 16]
        logger.info(' Valid Contact Number: %d', EIdx.size)
        Valid_Ratio = EIdx.size / float(iniNum)
        logger.info(' Valid Contact Ratio: %.3f', Valid_Ratio)
        Exi = xi[EIdx]
        Eyi = yi[EIdx]
        bSV[EIdx] = np.array(bS[Exi, Eyi]).ravel()
        bEV[EIdx] = np.array(bE[Exi, Eyi]).ravel()

        RefIdx = RefIdx[RNums < 16]

        iniNum = RefIdx.size

        if Valid_Ratio < 0.1:
            logger.info(
                ' Ratio of valid contact is too small, break the loop ...')
            break

        logger.info(' Continue ...')
        logger.info(' %d Contacts will get into next loop ...', RefIdx.size)

    # Candidates that never reached a valid background are dropped.
    RefMask[RefIdx] = False

    Mask = np.logical_and((bEV != 0), RefMask)
    xi = xi[Mask]
    yi = yi[Mask]
    bRV = bSV[Mask] / bEV[Mask]

    bR = sparse.csr_matrix((chromLen, chromLen))
    bR[xi, yi] = bRV

    ## Corrected Expected Matrix
    cEM = EM.multiply(bR)

    logger.info('Construct Poisson Models ...')
    ## Poisson Models
    xi, yi = cEM.nonzero()
    Evalues = np.array(cEM[xi, yi]).ravel() * biases[xi] * biases[yi]
    Mask = (Evalues > 0)
    Evalues = Evalues[Mask]
    xi = xi[Mask]
    yi = yi[Mask]
    Poisses = poisson(Evalues)
    logger.info('Number of Poisson Models: %d', Evalues.size)
    logger.info('Assign a p-value for each Observed Contact Frequency ...')
    Ovalues = np.array(M[xi, yi]).ravel()
    pvalues = 1 - Poisses.cdf(Ovalues)
    Fold = Ovalues / Evalues

    # Multiple Tests
    logger.info('Benjamini-Hochberg correcting for multiple tests ...')
    cResults = multipletests(pvalues, alpha=sig, method='fdr_bh')
    reject = cResults[0]
    cP = cResults[1] # Corrected Pvalue
    xpos = xi[reject]
    ypos = yi[reject]
    pvalues = pvalues[reject]
    qvalues = cP[reject]
    Ovalues = Ovalues[reject]
    Fold = Fold[reject]

    # Remove Gap Effect
    logger.info('Remove Gap Effects ...')
    # Gaps: rows of M whose sum is zero.
    gaps = set(np.where(np.array(M.sum(axis=1)).ravel() == 0)[0])
    if len(gaps) > 0:
        fIdx = []
        for i in xrange(xpos.size):
            # Keep a contact only if neither anchor's +/-5-bin
            # neighbourhood (clipped to the chromosome) touches a gap.
            lower = (xpos[i] - 5) if (xpos[i] > 5) else 0
            upper = (xpos[i] + 5) if ((xpos[i] + 5) < chromLen) else (chromLen - 1)
            cregion_1 = range(lower, upper)
            lower = (ypos[i] - 5) if (ypos[i] > 5) else 0
            upper = (ypos[i] + 5) if ((ypos[i] + 5) < chromLen) else (chromLen - 1)
            cregion_2 = range(lower, upper)
            cregion = set(cregion_1) | set(cregion_2)
            intersect = cregion & gaps
            if len(intersect) == 0:
                fIdx.append(i)

        xpos = xpos[fIdx]
        ypos = ypos[fIdx]
        pvalues = pvalues[fIdx]
        qvalues = qvalues[fIdx]
        Ovalues = Ovalues[fIdx]
        Fold = Fold[fIdx]

    return xpos, ypos, Ovalues, Fold, pvalues, qvalues
#expression[gene + "_" + sample] = [float(geneExpression[sampleInd]), np.mean(negativeExpr)] expression[gene + "_" + sample] = [ float(geneExpression[sampleInd]), negativeExpr ] cosmicGenePValuesOneSided = np.array(cosmicGenePValuesOneSided, dtype="object") sortedInd = cosmicGenePValuesOneSided[:, 1].argsort() cosmicGenePValuesOneSided = cosmicGenePValuesOneSided[sortedInd] cosmicGeneTStat = np.array(cosmicGeneTStat, dtype="object") cosmicGeneTStat = cosmicGeneTStat[sortedInd] print cosmicGenePValuesOneSided print cosmicGeneTStat reject, pAdjusted, _, _ = multipletests( cosmicGenePValuesOneSided[:, 1], method='bonferroni') #fdr_bh or bonferroni import matplotlib.pyplot as plt print "Significant COSMIC genes after bonferroni and 1-sided: " filteredPValues = [] signGenes = [] for pValueInd in range(0, len(cosmicGenePValuesOneSided)): if reject[pValueInd] == True and np.sign(cosmicGeneTStat[pValueInd, 1]) == 1: pValue = pAdjusted[pValueInd] filteredPValues.append( [cosmicGenePValuesOneSided[pValueInd, 0], pValue])
import pandas as pd
import numpy as np
from statsmodels.sandbox.stats.multicomp import multipletests

# Read the p-values, correct them for multiple testing and write the table
# back out with the corrected values in an extra column.
df = pd.read_csv('timeseries_significance_qvalues.csv', sep=",")

raw_pvalues = df['p_values'].values
# NOTE(review): the correction applied is Bonferroni although the output
# column is called 'bh_corrected' -- confirm which one is intended.
_, corrected, _, _ = multipletests(raw_pvalues, alpha=0.05,
                                   method='bonferroni')
df['bh_corrected'] = corrected

df.to_csv('corrected.txt', index=False)
def get_test(self):
    """
    Print significance tests comparing the per-sample NDCG lists of all
    models in ``self.models_list``, separately for every phase in
    ``self.phase_list``.

    For each phase: Mann-Whitney U and Wilcoxon for every pair of models,
    then Kruskal-Wallis across all models (also reported after a Bonferroni
    correction of that single p-value) and the Friedman chi-square test.

    :return: None, results are only printed
    """
    print(f"Calculating sign test for subset: {self.subset_type}")

    # Load all models predictions for each phase
    for phase in self.phase_list:
        phase_all_models_ndcg_list = []  # one ndcg list per model
        model_type_list = []  # matching model names
        for model_type in self.models_list:
            ndcg_list = self.read_file_as_list(
                self._get_ndcg_path(self.model_preds_root,
                                    model_type,
                                    phase=phase,
                                    subset_type=self.subset_type))
            print(f"Total samples {model_type}: {len(ndcg_list)}")
            model_type_list.append(model_type)
            phase_all_models_ndcg_list.append(ndcg_list)

        # Every unordered pair of models, in index order.
        named_preds = zip(model_type_list, phase_all_models_ndcg_list)
        for first, second in combinations(named_preds, 2):
            model1_name, model1_preds = first
            model2_name, model2_preds = second

            stat, p = mannwhitneyu(model1_preds, model2_preds)
            print(
                f'Mannwhitneyu - For phase: {phase} - models: {model1_name} vs'
                f' {model2_name} : stat={stat:.4f}, p={p:.4f}')

            stat, p = wilcoxon(model1_preds, model2_preds)
            print(
                f'Wilcoxon - For phase: {phase} - models: {model1_name} vs'
                f' {model2_name} : stat={stat:.4f}, p={p:.4f}')

        # Omnibus tests over all models at once.
        stat, p = kruskal(*phase_all_models_ndcg_list)
        print(f'Kruskal - For phase: {phase}: stat={stat:.4f}, p={p:.4f}')

        # multipletests returns
        # (reject, pvals_corrected, alphacSidak, alphacBonf)
        reject, pvals_corrected = multipletests(p, method='bonferroni')[:2]
        action = str(reject[0])  # np array
        new_p_value = pvals_corrected[0]
        print(
            f'Kruskal - bonferroni - For phase: {phase}: p={new_p_value:.4f}, '
            f'action: {str(action)}')

        stat, p = friedmanchisquare(*phase_all_models_ndcg_list)
        print(
            f'Friedmanchisquare - For phase: {phase}: stat={stat:.4f}, p={p:.4f}'
        )
def modbindevalscorer(modules, binding):
    """Score modules by their enrichment in the ``binding`` sets.

    ``modules`` is filtered with ``filter_size(5)``.  For every
    (module, binding column) pair a 2x2 contingency table gives an odds
    ratio and, through ``filterfisher``, a p-value; p-values are corrected
    row-wise with ``multipletests`` (its default method).  When
    ``binding.columns`` has more than one level the results are grouped by
    the first level (minimum p/q-value, maximum odds).

    Returns:
        ``{"aucodds": value}`` where value is the normalised area under the
        curve "fraction of rows whose best significant odds ratio has
        log10 >= cutoff" for cutoffs in [0, 3]; 0 when no module survives
        the size filter.
    """
    modules = modules.filter_size(5)

    if len(modules) == 0:
        odds = pd.DataFrame()
        pvals = pd.DataFrame()
        qvals = pd.DataFrame()
    else:
        modmem = modules.cal_membership(G=binding.index)
        binmem = binding

        modsizes = modmem.sum()
        binsizes = binmem.sum()

        # Contingency counts for every (module, binding set) pair.
        # ``np.int`` was removed in NumPy 1.24; the builtin is equivalent.
        tps = modmem.T.dot(binmem.astype(int))
        fps = binsizes - tps
        fns = (modsizes - tps.T).T
        tns = binmem.shape[0] - tps - fps - fns

        odds = ((tps * tns) / (fps * fns))

        values = np.array([
            odds.values.flatten(),
            tps.values.flatten(),
            fps.values.flatten(),
            fns.values.flatten(),
            tns.values.flatten()
        ])

        pvals = np.apply_along_axis(filterfisher, 0, values)

        # Correct within each row; index 1 of the multipletests result
        # holds the corrected p-values.
        qvals = [
            multipletests(pvalrow)[1] for pvalrow in pvals.reshape(tps.shape)
        ]

        qvals = pd.DataFrame(qvals, index=tps.index, columns=tps.columns)
        pvals = pd.DataFrame(pvals.reshape(tps.shape),
                             index=tps.index,
                             columns=tps.columns)

        if binding.columns.nlevels > 1:
            pvals = pvals.T.groupby(level=0).min()
            qvals = qvals.T.groupby(level=0).min()  # group by regulator
            odds = odds.T.groupby(level=0).max()  # group by regulator
        else:
            pvals = pvals.T
            qvals = qvals.T
            odds = odds.T

    ## auc odds
    if len(odds.index) == 0:
        aucodds = 0
    else:
        # Zero out non-significant odds ratios.  Build a new array instead
        # of assigning through ``DataFrame.values``, which is not
        # guaranteed to write back into the frame.
        not_significant = (qvals > 0.05).values.astype(bool)
        odds_filtered = pd.DataFrame(np.where(not_significant, 0,
                                              odds.values),
                                     index=odds.index,
                                     columns=odds.columns)
        odds_max = odds_filtered.max(1)

        cutoffs = np.linspace(0, 3, 100)
        stillenriched = [
            (np.log10(odds_max) >= cutoff).sum() / len(odds_max)
            for cutoff in cutoffs
        ]
        aucodds = np.trapz(stillenriched, cutoffs) / (cutoffs[-1] -
                                                      cutoffs[0])

    scores = {"aucodds": aucodds}

    return scores
for dataset in datasetids: print(dataset), ## Read dataset df, meta = read_dataset_files(dataset, datadir) for metric in ['shannon', 'chao1', 'simpson']: alpha = make_alpha_df(df, meta, dataset, metric) alphas.append(alpha) alphasdf = pd.concat(alphas, ignore_index=True) alphasdf.to_csv(args.alphas_out, sep='\t', index=False) # Because I'm using the entire OTU table, some of these samples don't have disease metadata. # I don't want to compare "NaN" labeled samples with anything because they mean nothing. alphasdf = alphasdf.query('DiseaseState != " "').dropna( subset=['DiseaseState']) pvals = [] for g, subdf in alphasdf.groupby('alpha_metric'): pval = get_layered_pvals(subdf, 'DiseaseState', 'alpha', 'study') pval = pd.DataFrame.from_dict(pval).stack().reset_index() pval.columns = ['comparison', 'study', 'p'] pval['q'] = multipletests(pval['p'])[1] pval['alpha_metric'] = g pvals.append(pval) pvalsdf = pd.concat(pvals) pvalsdf.to_csv(args.pvals_out, sep='\t', index=False)
def anova_modt(df, columns, design):
    """
    Fit the R ``moderated.t`` model on ``df[columns]`` with the given design
    matrix and collect per-coefficient statistics.

    Args:
        df (Pandas DataFrame): one column per measurement, rows=proteins
        columns (list(columns)): names of the data columns in df
        design (Pandas DataFrame): design matrix; its column names are the
            coefficient names and must include 'Intercept'

    Returns:
        (res_df, result)
        res_df (Pandas DataFrame): one row per protein, same order as df,
            made of the data columns, the first ``len(coefs) + 3`` columns
            of the overall topTable plus 'P.Value.Adj', and for every
            coefficient c other than 'Intercept' the columns 'F_<c>'
            (topTable 't'), 'PVal_<c>' (topTable 'P.Value') and
            'PVal_<c>_Adj' (fdr_bh-adjusted), followed by the remaining
            columns of df
        result (R object): the fitted model, usable only through topTable
    """
    coefs = list(design.columns)
    data = df[columns]
    n_rows = data.shape[0]

    # result is an R object; topTable is the only way to read it.
    result = r['moderated.t'](data, design=design)

    # Overall table: best estimates for each coefficient.
    overall = r['topTable'](result, number=n_rows, sort_by='none')
    res_coef = pandas2ri.ri2py(overall).iloc[:, :len(coefs) + 3]
    # Adjust overall p-value
    _, overall_adj, _, _ = multipletests(res_coef['P.Value'],
                                         alpha=ALPHA,
                                         method='fdr_bh')
    res_coef['P.Value.Adj'] = overall_adj

    # Per-coefficient statistics for every term other than the intercept.
    coefs.remove('Intercept')
    coef_col_map = {}
    result_colnames = []
    for name in coefs:
        target_cols = ['F_%s' % name, 'PVal_%s' % name, 'PVal_%s_Adj' % name]
        coef_col_map[name] = target_cols
        result_colnames.extend(target_cols)

    # Empty frame that receives the per-coefficient columns.
    res_f = pd.DataFrame(index=np.arange(n_rows),
                         columns=result_colnames,
                         dtype=float)
    for name, target_cols in coef_col_map.items():
        per_coef = r['topTable'](result,
                                 coef=name,
                                 number=n_rows,
                                 sort_by='none')
        F_pv = pandas2ri.ri2py(per_coef)[['t', 'P.Value']]
        pv_adj = multipletests(F_pv['P.Value'], alpha=ALPHA,
                               method='fdr_bh')[1]
        res_f[target_cols] = np.column_stack((F_pv.values, pv_adj))

    # Bind everything together into one frame on a fresh 0..n-1 index.
    aux_data = df.drop(columns, axis=1).reset_index(drop=True)
    for frame in (data, res_f, res_coef):
        frame.reset_index(drop=True, inplace=True)
    res_df = pd.concat((data, res_coef, res_f, aux_data), axis=1)
    return res_df, result
def run_psm(data, comp1, comp2, plex='both'):
    """
    Use PSM and roll up to the level of unique accession_number.

    Args:
        data: frame with at least 'accession_number' and 'geneSymbol'.
        comp1, comp2: the two comparison groups, resolved to column subsets
            by ``validate_comp_subset_data``.
        plex: 'A' or 'B' keeps only columns whose second-to-last character
            equals that letter; 'both' keeps all columns.

    Returns:
        DataFrame with one row per accession_number: the statistics from
        ``do_stat_tests_protein``, a '<col>_adj' column (fdr_bh) for every
        floating-point statistic column except the fold change, the gene
        symbol, and the counts 'n_pep' / 'n_valid'.

    Raises:
        ValueError: ``plex`` is not 'A', 'B' or 'both'.
    """
    start = time.time()

    c1, c2 = validate_comp_subset_data(data, comp1, comp2)

    if plex in ('A', 'B'):
        # Filter to corresponding plex
        c1 = c1[[col for col in c1.columns if col[-2] == plex]]
        c2 = c2[[col for col in c2.columns if col[-2] == plex]]
    elif plex != 'both':
        raise ValueError('Invalid specification of plex')

    pvals = do_stat_tests_protein(c1, c2, data.accession_number)

    # Delete pvals which are all NaN, i.e. skipped; otherwise adjust pvals.
    # Iterate over a snapshot because columns are removed and added below.
    for c in list(pvals.columns):
        if pvals[c].isnull().all():
            del pvals[c]
        elif c == u'fold_change_med':
            continue  # Don't adjust the pval for fold change
        elif np.issubdtype(pvals[c].dtype, np.floating):
            # ``dtype == np.number`` is False for every dtype on current
            # NumPy, which silently skipped the adjustment; test the dtype
            # family explicitly instead.
            pv = pvals[c].values
            # Mask NaN values so we don't bias the adjusted test
            mask = np.isfinite(pv)
            pv_corr = np.full(pv.shape, np.nan)
            pv_corr[mask] = multipletests(pv[mask],
                                          alpha=0.05,
                                          method='fdr_bh')[1]
            pvals[c + '_adj'] = pv_corr

    pvals.rename(columns={
        'protein_id': 'accession_number',
        'fold_change_med': 'fold_change'
    }, inplace=True)

    # Make auxiliary info (new frame rather than an in-place drop on a
    # slice of ``data``).
    aux_info = data[['accession_number', 'geneSymbol']].drop_duplicates()

    def get_group_counts(x):
        # Number of rows in the group and number of non-NaN values.
        return pd.Series({
            'n_pep': len(x),
            'n_valid': np.sum(1 - np.isnan(x).values)
        })

    tmp = (pd.concat((c1, c2), axis=1).groupby(
        data.accession_number).apply(get_group_counts).reset_index())

    out = pd.merge(pvals, aux_info, on='accession_number')
    out = pd.merge(out, tmp, on='accession_number')

    # Call form works on both Python 2 and Python 3.
    print(time.time() - start)
    return out
def fit(self, ids, ids2=None, voxel_thresh=0.01, q=0.05, corr='FWE',
        n_iters=5000, prior=0.5, n_cores=4):
    """
    Run the chi-square meta-analysis comparing two sets of studies.

    Args:
        ids (list): ids of the selected studies.
        ids2 (list or None): ids of the comparison studies. When None, every
            id found in self.coordinates['id'] that is not in `ids` is used.
        voxel_thresh (float): stored on the instance; not otherwise used in
            this method.
        q (float): alpha level handed to the FDR procedure (corr='FDR').
        corr (str): 'FWE' for permutation-based correction, 'FDR' for
            Benjamini-Hochberg. Any other value produces only the
            uncorrected maps.
        n_iters (int): number of permutations for corr='FWE'.
        prior (float): prior p(F) used for the '..._given_pF' maps.
        n_cores (int): size of the multiprocessing pool for corr='FWE'.

    The resulting maps are stored in self.results as a MetaResult.
    """
    self.voxel_thresh = voxel_thresh
    self.corr = corr
    self.n_iters = n_iters
    self.ids = ids
    if ids2 is None:
        ids2 = list(set(self.coordinates['id'].values) - set(self.ids))
    self.ids2 = ids2

    all_ids = self.ids + self.ids2
    red_coords = self.coordinates.loc[self.coordinates['id'].isin(all_ids)]

    k_est = self.kernel_estimator(red_coords, self.mask)
    ma_maps1 = k_est.transform(self.ids, masked=True,
                               **self.kernel_arguments)
    ma_maps2 = k_est.transform(self.ids2, masked=True,
                               **self.kernel_arguments)

    # Calculate different count variables
    eps = np.spacing(1)
    n_selected = len(self.ids)
    n_unselected = len(self.ids2)
    n_mappables = n_selected + n_unselected

    # Transform MA maps to 1d arrays
    ma_maps_all = np.vstack((ma_maps1, ma_maps2))

    n_selected_active_voxels = np.sum(ma_maps1, axis=0)
    n_unselected_active_voxels = np.sum(ma_maps2, axis=0)

    # Nomenclature for variables below: p = probability,
    # F = feature present, g = given, U = unselected, A = activation.
    # So, e.g., pAgF = p(A|F) = probability of activation
    # in a voxel if we know that the feature is present in a study.
    pF = (n_selected * 1.0) / n_mappables
    pA = np.array(np.sum(ma_maps_all, axis=0) / n_mappables).squeeze()

    # Conditional probabilities
    pAgF = n_selected_active_voxels * 1.0 / n_selected
    pAgU = n_unselected_active_voxels * 1.0 / n_unselected
    pFgA = pAgF * pF / pA

    # Recompute conditionals with uniform prior
    pAgF_prior = prior * pAgF + (1 - prior) * pAgU
    pFgA_prior = pAgF * prior / pAgF_prior

    # One-way chi-square test for consistency of activation
    pAgF_chi2_vals = one_way(np.squeeze(n_selected_active_voxels),
                             n_selected)
    pAgF_p_vals = special.chdtrc(1, pAgF_chi2_vals)
    pAgF_sign = np.sign(n_selected_active_voxels -
                        np.mean(n_selected_active_voxels))
    pAgF_z = p_to_z(pAgF_p_vals, tail='two') * pAgF_sign

    # Two-way chi-square for specificity of activation
    cells = np.squeeze(
        np.array([[n_selected_active_voxels, n_unselected_active_voxels],
                  [n_selected - n_selected_active_voxels,
                   n_unselected - n_unselected_active_voxels]]).T)
    pFgA_chi2_vals = two_way(cells)
    pFgA_p_vals = special.chdtrc(1, pFgA_chi2_vals)
    # Floor the p-values before they are converted to z-values.
    pFgA_p_vals[pFgA_p_vals < 1e-240] = 1e-240
    pFgA_sign = np.sign(pAgF - pAgU).ravel()
    pFgA_z = p_to_z(pFgA_p_vals, tail='two') * pFgA_sign

    images = {
        'pA': pA,
        'pAgF': pAgF,
        'pFgA': pFgA,
        ('pAgF_given_pF=%0.2f' % prior): pAgF_prior,
        ('pFgA_given_pF=%0.2f' % prior): pFgA_prior,
        'consistency_z': pAgF_z,
        'specificity_z': pFgA_z,
        'consistency_chi2': pAgF_chi2_vals,
        'specificity_chi2': pFgA_chi2_vals}

    if corr == 'FWE':
        # Every permutation gets the same coordinate table plus its own
        # column of random in-mask voxel indices.
        iter_dfs = [red_coords.copy()] * n_iters
        null_ijk = np.vstack(np.where(self.mask.get_data())).T
        rand_idx = np.random.choice(null_ijk.shape[0],
                                    size=(red_coords.shape[0], n_iters))
        rand_ijk = null_ijk[rand_idx, :]
        iter_ijks = np.split(rand_ijk, rand_ijk.shape[1], axis=1)

        params = zip(iter_dfs, iter_ijks, range(n_iters))
        with mp.Pool(n_cores) as p:
            perm_results = list(tqdm(p.imap(self._perm, params),
                                     total=self.n_iters))
        pAgF_null_chi2_dist, pFgA_null_chi2_dist = zip(*perm_results)

        # pAgF_FWE
        pAgF_null_chi2_dist = np.squeeze(pAgF_null_chi2_dist)
        # NOTE(review): this writes the null distribution into the current
        # working directory on every FWE fit; looks like a debugging
        # leftover - confirm before removing.
        np.savetxt('null_dist.txt', pAgF_null_chi2_dist)
        pAgF_p_FWE = np.empty_like(pAgF_chi2_vals).astype(float)
        # Loop over the consistency map's own length (previously bounded
        # by the specificity array's shape).
        for voxel in range(pAgF_chi2_vals.shape[0]):
            pAgF_p_FWE[voxel] = null_to_p(pAgF_chi2_vals[voxel],
                                          pAgF_null_chi2_dist,
                                          tail='upper')
        # Crop p-values of 0 or 1 to nearest values that won't evaluate to
        # 0 or 1. Prevents inf z-values.
        pAgF_p_FWE[pAgF_p_FWE < eps] = eps
        pAgF_p_FWE[pAgF_p_FWE > (1. - eps)] = 1. - eps
        pAgF_z_FWE = p_to_z(pAgF_p_FWE, tail='two') * pAgF_sign
        images['consistency_p_FWE'] = pAgF_p_FWE
        images['consistency_z_FWE'] = pAgF_z_FWE

        # pFgA_FWE
        pFgA_null_chi2_dist = np.squeeze(pFgA_null_chi2_dist)
        pFgA_p_FWE = np.empty_like(pFgA_chi2_vals).astype(float)
        for voxel in range(pFgA_chi2_vals.shape[0]):
            pFgA_p_FWE[voxel] = null_to_p(pFgA_chi2_vals[voxel],
                                          pFgA_null_chi2_dist,
                                          tail='upper')
        # Crop p-values of 0 or 1 to nearest values that won't evaluate to
        # 0 or 1. Prevents inf z-values.
        pFgA_p_FWE[pFgA_p_FWE < eps] = eps
        pFgA_p_FWE[pFgA_p_FWE > (1. - eps)] = 1. - eps
        pFgA_z_FWE = p_to_z(pFgA_p_FWE, tail='two') * pFgA_sign
        images['specificity_p_FWE'] = pFgA_p_FWE
        images['specificity_z_FWE'] = pFgA_z_FWE
    elif corr == 'FDR':
        # Honour the caller's q instead of a hard-coded 0.05. Only the
        # adjusted p-values are used below.
        _, pAgF_p_FDR, _, _ = multipletests(pAgF_p_vals, alpha=q,
                                            method='fdr_bh',
                                            is_sorted=False,
                                            returnsorted=False)
        pAgF_z_FDR = p_to_z(pAgF_p_FDR, tail='two') * pAgF_sign
        images['consistency_z_FDR'] = pAgF_z_FDR

        _, pFgA_p_FDR, _, _ = multipletests(pFgA_p_vals, alpha=q,
                                            method='fdr_bh',
                                            is_sorted=False,
                                            returnsorted=False)
        pFgA_z_FDR = p_to_z(pFgA_p_FDR, tail='two') * pFgA_sign
        images['specificity_z_FDR'] = pFgA_z_FDR

    self.results = MetaResult(self, mask=self.mask, **images)
fisherResults[id_value] = {id_label: id_value, termlabel: termname, 'p-Value': pvalue, '#Test': ct.loc['Test_set', ct.columns[0]], '#Ref': ct.loc['Reference_set', ct.columns[0]], '#notAnnotTest': ct.loc['Test_set', ct.columns[1]], '#notAnnotRef': ct.loc['Reference_set', ct.columns[1]], 'Over/Under': sig, 'TestSeqs': genelist_test, 'RefSeqs': genelist_ref } fr = pd.DataFrame(fisherResults).T benjamini = sm.multipletests(fr['p-Value'], method = 'fdr_bh', alpha=args.thresh) fr = pd.concat([fr, pd.Series(benjamini[1], name='FDR', index=fr.index)], axis=1) #p-adjusted fr = pd.concat([fr, pd.Series(benjamini[0], name='FDR_TEST', index=fr.index)], axis=1) #is_rejected # sns.set(color_codes=True) # sns.distplot(fr['p-Value'], kde=False, bins=20) # sns.plt.show() fr_filtered = fr[fr['p-Value'] <= args.thresh] fr_filtered.to_csv(os.path.join(basedir, outfile + "ix"), columns=[termlabel,'FDR','p-Value','#Test','#Ref','#notAnnotTest','#notAnnotRef','Over/Under','TestSeqs','RefSeqs'], header=True, index_label=id_label, sep='\t')
def p_roi_masking(substitution, ts_file_template, beta_file_template, p_file_template, design_file_template, event_file_template, p_level, brain_mask):
	"""Build an FDR-thresholded mask from a p-value map and use it to extract the mean timecourse and a beta-scaled design.

	All templates are filled in with `substitution` and expanded to absolute paths.

	Parameters
	----------

	substitution : dict
		A dictionary containing the template replacement fields as keys and identifiers as values.
		The "subject" and "session" entries are used for the subplot title.
	ts_file_template : string
		Timecourse file template with replacement fields.
	beta_file_template : string
		Beta file template with replacement fields.
	p_file_template : string
		P-value map file template with replacement fields.
		Its voxels inside `brain_mask` are FDR-corrected (Benjamini-Hochberg) to obtain the mask.
	design_file_template : string
		Design file template with replacement fields.
		Read as a tab-separated table without header, skipping the first 5 rows.
	event_file_template : string
		Event file template with replacement fields. Read as a tab-separated table.
	p_level : float
		Alpha level handed to the FDR correction.
	brain_mask : string
		Path to a mask file; it is flattened and used to index the flattened p-value map, so both must contain the same number of voxels.
		Only p-values inside this mask take part in the correction.

	Returns
	-------

	timecourse : array_like
		Mean timecourse over the voxels that survive the correction.
	design : pandas.DataFrame
		Design table multiplied by the mean beta of the surviving voxels.
	mask_map : data
		Nibabel image of the mask.
	event_df : pandas.DataFrame
		Contents of the event file.
	subplot_title : string
		Title for the subplot, computed from the substitution fields.

	If the p-value map or the brain mask cannot be found, or if masking the timecourse or beta file raises a ValueError, five `None` values are returned instead.
	"""

	def _resolve(raw_path):
		return path.abspath(path.expanduser(raw_path))

	nothing = (None, None, None, None, None)

	ts_file = _resolve(ts_file_template.format(**substitution))
	beta_file = _resolve(beta_file_template.format(**substitution))
	p_file = _resolve(p_file_template.format(**substitution))
	design_file = _resolve(design_file_template.format(**substitution))
	event_file = _resolve(event_file_template.format(**substitution))
	brain_mask = _resolve(brain_mask)

	try:
		p_img = nib.load(p_file)
		brain_img = nib.load(brain_mask)
	except (FileNotFoundError, nib.py3k.FileNotFoundError):
		return nothing

	p_values = p_img.get_data()
	volume_shape = p_values.shape
	inside_brain = brain_img.get_data().flatten().astype(bool)

	# Correct only the in-brain p-values, then write the reject decisions
	# back into the in-brain positions of the flattened mask.
	reject, nonzero_data, _, _ = multipletests(p_values.flatten()[inside_brain], p_level, method="fdr_bh")
	inside_brain[inside_brain] = reject
	mask = inside_brain.astype(int).reshape(volume_shape)

	mask_map = nib.Nifti1Image(mask, p_img.affine, p_img.header)
	masker = NiftiMasker(mask_img=mask_map)
	try:
		timecourse = masker.fit_transform(ts_file).T
		betas = masker.fit_transform(beta_file).T
	except ValueError:
		return nothing

	title_parts = [str(substitution["subject"]), str(substitution["session"])]
	subplot_title = "\n ".join(title_parts)

	timecourse = np.mean(timecourse, axis=0)
	design = pd.read_csv(design_file, skiprows=5, sep="\t", header=None, index_col=False)
	design = design*np.mean(betas)
	event_df = pd.read_csv(event_file, sep="\t")

	return timecourse, design, mask_map, event_df, subplot_title
for t in TARGETS_CLIN_BL for r in REGRESSORS_OI] + \ ['%s~%s+AGE_AT_INCLUSION+SEX+EDUCATION' % (t, r) for t in TARGETS_NI for r in REGRESSORS_OI] mod = MULM(data=data, formulas=formulas_all_simple) stats_all_simple = mod.t_test(contrasts=1, out_filemane=None) mod = MULM(data=data, formulas=formulas_all_covars) stats_all_covars = mod.t_test(contrasts=1, out_filemane=None) mod = MULM(data=data, formulas=formulas_all) stats_all = mod.t_test(contrasts=1, out_filemane=None) mod = MULM(data=data, formulas=formulas_oi) stats_oi = mod.t_test(contrasts=1, out_filemane=None) stats_oi["Corrected P value"] = multipletests(stats_oi.pvalue, method='fdr_bh')[1] summary = stats_oi.copy() summary["Variable"] = summary.target.replace({ 'TMTB_TIME': 'TMTB', "MDRS_TOTAL": "MDRS", "MRS": "mRS" }) summary["PC"] = summary.contrast.replace({ 'pc1__tvl1l2': 1, 'pc2__tvl1l2': 2, 'pc3__tvl1l2': 3 }) summary["P value"] = summary.pvalue summary["t statistic"] = summary.tvalue
def CoexpressionAnalysis(client, SL_or_SDL, data_resource, input_genes,
                         adj_method, fdr_level, tissues):
    '''
    Description: Detects candidate SL/SDL pairs from rank correlations of
        gene expression, computed in BigQuery.
    Inputs:
        client:BigQueryClient, the BigQuery client that will run the query.
        SL_or_SDL:string, Synthetic lethal or Synthetic Dosage Lethal, valid values: 'SL', 'SDL'
        data_resource: string, the data resource the analysis is run on, valid values: "CCLE", "PanCancerAtlas"
        input_genes:list of strings, the genes whose SL/SDL partners are sought
        adj_method: string, p value correction method passed to multipletests, e.g. bonferroni, sidak, holm-sidak, holm,
                    simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky
        fdr_level:string, the scope of the p value adjustment, valid values: "gene_level" (per input gene), "analysis_level" (all pairs together)
        tissues: the tissues the analysis is restricted to.
    Output:
        A dataframe of SL/SDL pairs. An empty tuple is returned (after printing a
        message) for an invalid data_resource or fdr_level, or when too few samples
        are selected; the raw empty query result is returned when no pair is found.
    '''
    # Resolve the table/column names and the sample and alias lookups for
    # the requested data resource.
    if data_resource == 'PanCancerAtlas':
        table_name = 'isb-cgc-bq.pancancer_atlas.Filtered_EBpp_AdjustPANCAN_IlluminaHiSeq_RNASeqV2_genExp'
        gene_col_name, exp_name, sample_barcode = 'Symbol', 'normalized_count', 'SampleBarcode'
        entrez_col_name = 'Entrez'
        selected_samples = RetrieveSamples(client, 'PanCancerAtlas', 'correlation', tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'PanCancerAtlas')
    elif data_resource == 'CCLE':
        table_name = 'isb-cgc-bq.DEPMAP.CCLE_gene_expression_DepMapPublic_current'
        gene_col_name, exp_name, sample_barcode = 'Hugo_Symbol', 'TPM', 'DepMap_ID'
        entrez_col_name = 'Entrez_ID'
        selected_samples = RetrieveSamples(client, 'CCLE', 'correlation', tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'DepMap')
    else:
        print("The database name can be either PanCancerAtlas or CCLE")
        return ()

    min_sample_size = 20
    n_samples = len(selected_samples)
    if n_samples < (min_sample_size + 1):
        print("Sample size needs to be greater than " + str(min_sample_size) + ", it is " + str(n_samples))
        return ()

    sql_correlation = """
    CREATE TEMPORARY FUNCTION tscore_to_p(a FLOAT64, b FLOAT64, c FLOAT64)
    RETURNS FLOAT64
    LANGUAGE js AS
    \"\"\"
    return jStat.ttest(a,b,c); //jStat.ttest( tscore, n, sides)
    \"\"\"
    OPTIONS (
    library="gs://javascript-lib/jstat.min.js"
    );
    WITH
    table1 AS (
    SELECT
    symbol,
    (RANK() OVER (PARTITION BY symbol ORDER BY data ASC)) + (COUNT(*) OVER ( PARTITION BY symbol, CAST(data as STRING)) - 1)/2.0 AS rnkdata,
    ParticipantBarcode
    FROM (
    SELECT
    __GENE_SYMBOL__ symbol,
    AVG( __EXP_NAME__) AS data,
    __SAMPLE_ID__ AS ParticipantBarcode
    FROM `__TABLE_NAME__`
    WHERE __GENE_SYMBOL__ IN (__GENE_LIST__) # labels
    AND __EXP_NAME__ IS NOT NULL AND __SAMPLE_ID__ in (__SAMPLE_LIST__)
    GROUP BY
    ParticipantBarcode, symbol
    )
    )
    ,
    table2 AS (
    SELECT
    symbol,
    (RANK() OVER (PARTITION BY symbol ORDER BY data ASC)) + (COUNT(*) OVER ( PARTITION BY symbol, CAST(data as STRING)) - 1)/2.0 AS rnkdata,
    ParticipantBarcode
    FROM (
    SELECT
    __GENE_SYMBOL__ symbol,
    AVG(__EXP_NAME__) AS data,
    __SAMPLE_ID__ AS ParticipantBarcode
    FROM `__TABLE_NAME__`
    WHERE __GENE_SYMBOL__ IS NOT NULL # labels
    AND __EXP_NAME__ IS NOT NULL AND __SAMPLE_ID__ in (__SAMPLE_LIST__)
    GROUP BY
    ParticipantBarcode, symbol
    )
    )
    ,
    summ_table AS (
    SELECT
    n1.symbol as symbol1,
    n2.symbol as symbol2,
    COUNT( n1.ParticipantBarcode ) as n,
    CORR(n1.rnkdata , n2.rnkdata) as correlation
    FROM
    table1 AS n1
    INNER JOIN
    table2 AS n2
    ON
    n1.ParticipantBarcode = n2.ParticipantBarcode
    AND n2.symbol NOT IN (__GENE_LIST__)
    GROUP BY
    symbol1, symbol2
    UNION ALL
    SELECT
    n1.symbol as symbol1,
    n2.symbol as symbol2,
    COUNT( n1.ParticipantBarcode ) as n,
    CORR(n1.rnkdata , n2.rnkdata) as correlation
    FROM
    table1 AS n1
    INNER JOIN
    table1 AS n2
    ON
    n1.ParticipantBarcode = n2.ParticipantBarcode
    AND n1.symbol < n2.symbol
    GROUP BY
    symbol1, symbol2
    )
    SELECT *,
    tscore_to_p( ABS(correlation)*SQRT( (n-2)/((1+correlation)*(1-correlation))) ,n-2, 2) as pvalue
    #`cgc-05-0042.Auxiliary.significance_level_ttest2`(n-2, ABS(correlation)*SQRT( (n-2)/((1+correlation)*(1-correlation)))) as alpha
    FROM summ_table
    WHERE n > 20 #AND correlation > __COR_THRESHOLD__
    GROUP BY 1,2,3,4,5
    #HAVING pvalue <= __P_THRESHOLD__
    ORDER BY symbol1 ASC, correlation DESC
    """

    # Quote the genes and samples and splice everything into the template.
    input_genes = ["'" + str(gene) + "'" for gene in input_genes]
    input_genes_for_query = ','.join(input_genes)
    included_samples = ','.join(["'" + str(sample) + "'" for sample in selected_samples])

    substitutions = (
        ('__GENE_LIST__', input_genes_for_query),
        ('__TABLE_NAME__', table_name),
        ('__GENE_SYMBOL__', gene_col_name),
        ('__EXP_NAME__', exp_name),
        ('__SAMPLE_ID__', sample_barcode),
        ('__SAMPLE_LIST__', included_samples),
    )
    for placeholder, value in substitutions:
        sql_correlation = sql_correlation.replace(placeholder, value)

    results = client.query(sql_correlation).result().to_dataframe()
    if results.shape[0] < 1:
        print("Coexpression inference procedure applied on " + data_resource
              + " did not find candidate " + SL_or_SDL + " pairs.")
        return (results)

    report = results[['symbol1', 'symbol2', 'n', 'correlation', 'pvalue']]
    report = report.dropna()
    report.columns = ['InactiveDB', 'SL_Candidate', '#Samples', 'Correlation', 'PValue']
    report['Inactive'] = report['InactiveDB'].map(gene_mapping)

    if fdr_level == "gene_level":
        # Adjust the p-values separately within each input gene.
        for gene in list(report["Inactive"].unique()):
            rows_of_gene = report["Inactive"] == gene
            report.loc[rows_of_gene, 'FDR'] = multipletests(
                report.loc[rows_of_gene, 'PValue'],
                method=adj_method, is_sorted=False)[1]
    elif fdr_level == "analysis_level":
        # Adjust all p-values of the analysis together.
        report['FDR'] = multipletests(report['PValue'], method=adj_method,
                                      is_sorted=False)[1]
    else:
        print("FDR level can be either gene_level or analysis_level")
        return ()

    report['Tissue'] = str(tissues)
    report = report[['Inactive', 'InactiveDB', 'SL_Candidate', '#Samples',
                     'Correlation', 'PValue', 'FDR', 'Tissue']]
    if SL_or_SDL == "SDL":
        report.columns = ['Overactive', 'OveractiveDB', 'SL_Candidate',
                          '#Samples', 'Correlation', 'PValue', 'FDR', 'Tissue']
    return report
valueable_differencies_count += 1 if min_pvalue > wilcox.pvalue: min_pvalue = wilcox.pvalue min_first = first min_second = second comparison_frame = pd.DataFrame(comparison_result, columns=["Names", "Statistic", "p-value"]) comparison_frame #%% print "\nMost different classificators: \"%s\" and \"%s\" with p-value: %f" % (min_first, min_second, min_pvalue) # Сколько статистически значимых на уровне 0.05 различий мы обнаружили? #%% print "Statistically valuable differencies count: %i" % valueable_differencies_count # Сравнивая 4 классификатора между собой, мы проверили 6 гипотез. # Давайте сделаем поправку на множественную проверку. Начнём с метода Холма. # Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки этим методом? #%% from statsmodels.sandbox.stats.multicomp import multipletests reject_holm, p_corrected_holm, a1_holm, a2_holm = multipletests(comparison_frame["p-value"], alpha = 0.05, method = 'holm') print "Hypothesis to reject after holm correction count: %i" % len(filter(lambda whether_reject: whether_reject, reject_holm)) # Сколько гипотез можно отвергнуть на уровне значимости 0.05 после поправки методом # Бенджамини-Хохберга? #%% reject_fdr, p_corrected_fdr, a1_fdr, a2_fdr = multipletests(comparison_frame["p-value"], alpha = 0.05, method = 'fdr_bh') print "Hypothesis to reject after fdr correction count: %i" % len(filter(lambda whether_reject: whether_reject, reject_fdr))
# Load the BED tables and the per-sample variant tables.
beds = []
for i in bed_paths:
    beds.append(pd.read_table(i, sep="\t", names=["Region", "Start", "End", "Name", "Score", "Strand"]))

vdfs = []
for _i, i in enumerate(df_paths):
    _ = pd.read_table(i, names=["POS", "REF", "ALT", "AD", "REV", "DP", "QUAL"], skiprows = 1)
    vdfs.append(_)

# Fisher's Exact Test
#           | AD | DP  |
# Variant   |    |     |
# Threshold | 3  | 100 |
# Each variant's (AD, DP) is compared against the expected counts at the
# `freq` percent threshold; the p-values are then BH-adjusted per table.
# NOTE(review): `freq/100` truncates to 0 under Python 2 when freq is an
# int - confirm the interpreter version / type of freq.
for vdf in vdfs:
    pvals = vdf.apply(lambda x: fe([[x["AD"], x["DP"]], [(freq/100)*x["DP"], x["DP"]]], "greater"), axis = 1)
    vdf["threshold_"+str(freq)+"%_pval"] = multipletests([i[1] for i in pvals], method="fdr_bh")[1]
    vdf["threshold_"+str(freq)+"%_oddsratio"] = [i[0] for i in pvals]

threshold = freq
col = "threshold_"+str(threshold)+"%"
pval_threshold = 0.05

_ = vdfs
# _ = [i[i[col+"_pval"]<=pval_threshold] for i in _]
# Inner-join all tables on the variant key; clashing columns get the
# per-table suffixes "_0", "_1", ...
df = _[0]
for _i, i in enumerate(_[1:]):
    df = df.merge(i, how='inner', on=['POS', 'REF', 'ALT'], suffixes = ("_0", "_"+str(_i+1)))

# Select every adjusted p-value column (with or without a merge suffix).
# A plain prefix test replaces the old pattern, whose second "\b" was not
# in a raw string and therefore meant a backspace character.
cols = df.columns[df.columns.str.startswith(col + "_pval")]
# Keep variants significant in at least one table (.loc replaces the
# removed .ix indexer).
df = df.loc[(df[cols] <= pval_threshold).any(axis=1)]
masked = []
def SurvivalOfFittest(client, SL_or_SDL, data_source, input_genes,
                      percentile_threshold, cn_threshold, adj_method,
                      fdr_level, tissues, input_mutations='None'):
    '''
    Description: Gene expression, Copy Number Alteration (CNA), Somatic Mutations are used to decide whether gene is inactive.
                 The SL pair detection according to difference in CNA given one gene is inactive vs not-inactive
    Inputs:
        client:BigQueryClient, the BigQuery client that will run the function.
        SL_or_SDL:string, Synthetic lethal or Synthetic Dosage Lethal, valid values: 'SL', 'SDL'
        data_source: string, The data source the analysis will be performed on, valid values: "CCLE", "PanCancerAtlas"
        input_genes:list of strings, the list of genes whose SL/SDL partners will be sought
        percentile_threshold:double, the threshold (in percent) for gene expression (for deciding whether a gene is inactive)
        cn_threshold:double, the threshold for copy number alteration (for deciding whether a gene is inactive);
                     for CCLE it is converted with log2(2**cn_threshold + 1) before use
        adj_method: string, p value correction method passed to multipletests, e.g. bonferroni, sidak, holm-sidak, holm,
                    simes-hochberg, hommel, fdr_bh, fdr_by, fdr_tsbh, fdr_tsbky
        fdr_level:string, the data that will be considered while doing p value adjustment, valid values : "gene_level", "analysis_level"
        tissues: The tissues that the analysis will be performed on.
        input_mutations:list of strings, optional, valid values: Missense_Mutation, Nonsense_Mutation, Translation_Start_Site,
                        Frame_Shift_Ins, Splice_Site, In_Frame_Del, Frame_Shift_Del, Nonstop_Mutation, In_Frame_Ins.
                        None or the string 'None' (the default) disables the mutation criterion; a single
                        mutation name given as a string is accepted as a one-element list.
    Output:
        A dataframe of SL/SDL pairs. An empty tuple is returned (after printing a message)
        for an invalid data_source, SL_or_SDL or fdr_level, or when too few samples are
        selected; the raw empty query result is returned when no pair is found.
    '''
    if data_source == 'PanCancerAtlas':
        gene_exp_table = 'isb-cgc-bq.pancancer_atlas.Filtered_EBpp_AdjustPANCAN_IlluminaHiSeq_RNASeqV2_genExp'
        mutation_table = 'isb-cgc-bq.pancancer_atlas.Filtered_MC3_MAF_V5_one_per_tumor_sample'
        cn_table = 'isb-cgc-bq.pancancer_atlas.Filtered_all_CNVR_data_by_gene'
        sample_id = 'SampleBarcode'
        gene_col_name = 'Symbol'
        gene_exp = 'normalized_count'
        cn_gene_name = 'Gene_Symbol'
        mutation_gene_name = 'Hugo_Symbol'
        mutation_sample_id = 'Tumor_SampleBarcode'
        cn_gistic = 'GISTIC_Calls'
        entrez_id = 'Entrez'
        selected_samples = RetrieveSamples(client, 'PanCancerAtlas', 'sof', tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'PanCancerAtlas')
    elif data_source == 'CCLE':
        mutation_table = 'isb-cgc-bq.DEPMAP.CCLE_mutation_DepMapPublic_current'
        gene_exp_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_expression_DepMapPublic_current'
        cn_table = 'isb-cgc-bq.DEPMAP.CCLE_gene_cn_DepMapPublic_current'
        sample_id = 'DepMap_ID'
        gene_col_name = 'Hugo_Symbol'
        gene_exp = 'TPM'
        cn_gene_name = 'Hugo_Symbol'
        mutation_gene_name = 'Hugo_Symbol'
        mutation_sample_id = 'Tumor_Sample_Barcode'
        cn_gistic = 'CNA'
        cn_threshold = np.log2(2**(cn_threshold) + 1)
        entrez_id = 'Entrez_ID'
        selected_samples = RetrieveSamples(client, 'CCLE', 'sof', tissues)
        gene_mapping = ProcessGeneAlias(client, input_genes, 'DepMap')
    else:
        print("The data source name can be either PanCancerAtlas or CCLE")
        return ()

    min_sample_size = 20
    if len(selected_samples) < (min_sample_size + 1):
        print("Sample size needs to be greater than " + str(min_sample_size), " it is " + str(len(selected_samples)))
        return ()

    # The default is the *string* 'None'; without this normalisation it
    # would be iterated character by character into the mutation list.
    if isinstance(input_mutations, str):
        input_mutations = None if input_mutations == 'None' else [input_mutations]

    # Samples in which the gene counts as inactive (SL) / overactive (SDL)
    # by expression percentile AND copy number. The closing parenthesis of
    # table1 is appended below, after the optional mutation part.
    sql_without_mutation = '''
    WITH
    table1 AS (
    (SELECT symbol, Barcode FROM
    (SELECT GE.__EXP_GENE_NAME__ AS symbol, GE.__SAMPLE_ID__ AS Barcode ,
    PERCENT_RANK () over (partition by __EXP_GENE_NAME__ order by __GENE_EXPRESSION__ asc) AS Percentile
    FROM __GENE_EXP_TABLE__ GE
    WHERE GE.__EXP_GENE_NAME__ in (__GENELIST__) AND __SAMPLE_ID__ in (__SAMPLE_LIST__)
    AND GE.__GENE_EXPRESSION__ is not null
    ) AS NGE
    WHERE NGE.Percentile __GENE_CMP_STR__
    INTERSECT DISTINCT
    SELECT symbol , Barcode FROM
    (SELECT CN.__CN_GENE_NAME__ AS symbol, CN.__SAMPLE_ID__ AS Barcode,
    CN.__CN_GISTIC__ AS NORM_CN
    FROM __CN_TABLE__ CN
    WHERE CN.__CN_GENE_NAME__ in (__GENELIST__) AND __SAMPLE_ID__ in (__SAMPLE_LIST__) and CN.__CN_GISTIC__ is not null
    ) AS NC
    WHERE NC.NORM_CN __CN_CMP_STR__
    )'''

    # Optional extension: samples carrying one of the requested mutations.
    if data_source == 'CCLE':
        sql_mutation_part = '''
    UNION DISTINCT
    SELECT M.__MUTATION_GENE_NAME__ AS symbol , M.__MUTATION_SAMPLE_ID__ AS Barcode
    FROM __MUTATION_TABLE__ M
    WHERE __MUTATION_GENE_NAME__ IN (__GENELIST__) AND
    M.Variant_Classification IN (__MUTATIONLIST__) AND __MUT_SAMPLE_ID__ in (__SAMPLE_LIST__)
    )'''
    elif data_source == 'PanCancerAtlas':
        sql_mutation_part = '''
    UNION DISTINCT
    SELECT M.__MUTATION_GENE_NAME__ AS symbol , M.__MUTATION_SAMPLE_ID__ AS Barcode
    FROM __MUTATION_TABLE__ M
    WHERE __MUTATION_GENE_NAME__ IN (__GENELIST__) AND
    M.Variant_Classification IN (__MUTATIONLIST__) AND __MUT_SAMPLE_ID__ in (__SAMPLE_LIST__) AND Filter="PASS"
    )'''

    # Rank-based comparison of copy number between the table1 samples and
    # all samples, per (input gene, candidate gene) pair.
    rest_of_the_query = '''
    , table2 AS (
    SELECT
    __SAMPLE_ID__ Barcode, __CN_GENE_NAME__ symbol,
    (RANK() OVER (PARTITION BY __CN_GENE_NAME__ ORDER BY __CN_GISTIC__ ASC)) + (COUNT(*) OVER ( PARTITION BY __CN_GENE_NAME__, CAST(__CN_GISTIC__ as STRING)) - 1)/2.0 AS rnkdata
    FROM
    __CN_TABLE__
    where __CN_GENE_NAME__ IS NOT NULL
    AND __SAMPLE_ID__ in (__SAMPLE_LIST__) AND __CN_GISTIC__ is not null
    ),
    summ_table AS (
    SELECT
    n1.symbol as symbol1,
    n2.symbol as symbol2,
    COUNT( n1.Barcode) as n_1,
    SUM( n2.rnkdata ) as sumx_1,
    FROM
    table1 AS n1
    INNER JOIN
    table2 AS n2
    ON
    n1.Barcode = n2.Barcode
    GROUP BY
    symbol1, symbol2 ),
    statistics AS (
    SELECT symbol1, symbol2, n1, n, U1,
    (n1n2/2.0 - U1)/den as zscore
    FROM (
    SELECT symbol1, symbol2, n_t as n,
    n_1 as n1,
    sumx_1 - n_1 *(n_1 + 1) / 2.0 as U1,
    n_1 * (n_t - n_1 ) as n1n2,
    SQRT( n_1 * (n_t - n_1 )*(n_t + 1) / 12.0 ) as den
    FROM summ_table as t1
    LEFT JOIN ( SELECT symbol, COUNT( Barcode ) as n_t
    FROM table2
    GROUP BY symbol) t2
    ON symbol2 = symbol
    WHERE n_t > 20 and n_1>5
    )
    WHERE den > 0
    )
    SELECT symbol1, symbol2, n1, n, U1,
    `cgc-05-0042.functions.jstat_normal_cdf`(zscore, 0.0, 1.0 ) as pvalue
    FROM statistics
    GROUP BY 1,2,3,4,5,6
    #HAVING pvalue <= 0.01
    ORDER BY pvalue ASC
    '''

    input_genes = ["'" + str(x) + "'" for x in input_genes]
    input_genes_query = ','.join(input_genes)
    included_samples = ["'" + str(x) + "'" for x in selected_samples]
    included_samples = ','.join(included_samples)

    if SL_or_SDL == 'SDL' or input_mutations is None:
        sql_sof = sql_without_mutation + ')' + ' ' + rest_of_the_query
    else:
        mutations_intermediate_representation = [
            "'" + str(x) + "'" for x in input_mutations
        ]
        input_mutations_for_query = ','.join(
            mutations_intermediate_representation)
        sql_sof = sql_without_mutation + ' ' + sql_mutation_part + ' ' + rest_of_the_query
        # These placeholders only exist in the mutation part.
        sql_sof = sql_sof.replace('__MUTATION_TABLE__', mutation_table)
        sql_sof = sql_sof.replace('__MUTATION_SAMPLE_ID__', mutation_sample_id)
        sql_sof = sql_sof.replace('__MUTATIONLIST__', input_mutations_for_query)

    if SL_or_SDL == "SL":
        comp_str = "<" + str(cn_threshold)
        com_gene_th = "<" + str(percentile_threshold / 100)
    elif SL_or_SDL == "SDL":
        comp_str = ">" + str(cn_threshold)
        com_gene_th = ">" + str(percentile_threshold / 100)
    else:
        # Previously this fell through to an undefined-name error.
        print("SL_or_SDL can be either SL or SDL")
        return ()

    for placeholder, value in (
            ('__GENELIST__', input_genes_query),
            ('__CN_TABLE__', cn_table),
            ('__GENE_EXP_TABLE__', gene_exp_table),
            ('__SAMPLE_ID__', sample_id),
            ('__MUT_SAMPLE_ID__', mutation_sample_id),
            ('__ENTREZ_ID__', entrez_id),
            ('__GENE_EXPRESSION__', gene_exp),
            ('__CN_GISTIC__', cn_gistic),
            ('__EXP_GENE_NAME__', gene_col_name),
            ('__CN_GENE_NAME__', cn_gene_name),
            ('__MUTATION_GENE_NAME__', mutation_gene_name),
            ('__SAMPLE_LIST__', included_samples),
            ('__CN_CMP_STR__', comp_str),
            ('__GENE_CMP_STR__', com_gene_th)):
        sql_sof = sql_sof.replace(placeholder, value)

    results = client.query(sql_sof).result().to_dataframe()
    if results.shape[0] < 1:
        # Uses data_source; the old message referenced an undefined name.
        print("SOF inference procedure applied on " + data_source
              + " did not find candidate " + SL_or_SDL + " pairs.")
        return (results)

    report = results[['symbol1', 'symbol2', 'n1', 'n', 'U1', 'pvalue']]
    report = report.dropna()
    report.columns = [
        'InactiveDB', 'SL_Candidate', '#InactiveSamples', '#Samples', 'U1',
        'PValue'
    ]
    report['Inactive'] = report['InactiveDB'].map(gene_mapping)

    if fdr_level == "gene_level":
        # Adjust the p-values separately within each input gene.
        for gene in list(report["Inactive"].unique()):
            rows_of_gene = report["Inactive"] == gene
            report.loc[rows_of_gene, 'FDR'] = multipletests(
                report.loc[rows_of_gene, 'PValue'],
                method=adj_method, is_sorted=False)[1]
    elif fdr_level == "analysis_level":
        FDR = multipletests(report['PValue'], method=adj_method,
                            is_sorted=False)[1]
        report['FDR'] = FDR
    else:
        print("FDR level can be either gene_level or analysis_level")
        return ()

    report['Tissue'] = str(tissues)
    cols = [
        'Inactive', 'InactiveDB', 'SL_Candidate', '#InactiveSamples',
        '#Samples', 'PValue', 'FDR', 'Tissue'
    ]
    report = report[cols]
    if SL_or_SDL == "SDL":
        report.columns = [
            'Overactive', 'OveractiveDB', 'SL_Candidate', '#Overactive',
            '#Samples', 'PValue', 'FDR', 'Tissue'
        ]
    return report
# Collect one p-value per "<module>_<gene type>" key from every input file.
# Each line is whitespace-separated: module id, gene type, p-value.
Files = conf.inputs
for File in Files:
    with open(File) as f:
        for line in f.readlines():
            split_line = line.split()
            Module_ID, Gene_type = str(split_line[0]), str(split_line[1])
            P_value = float(split_line[2])
            key = "_".join([Module_ID, Gene_type])
            keys.append(key)
            P_vals.append(P_value)
            uncorrected_dict[key] = P_value

# Correct all collected p-values at once, with both procedures.
FDR = conf.FDR
Benjamini_Pval_array = stats.multipletests(P_vals, alpha=FDR, method='fdr_bh',
                                           is_sorted=False, returnsorted=False)
Bonferroni_Pval_array = stats.multipletests(P_vals, alpha=FDR, method='bonferroni',
                                            is_sorted=False, returnsorted=False)

# Element [1] of each result holds the corrected p-values, in input order,
# so position i belongs to keys[i].
i = 0
for key in keys:
    P_value_BH = Benjamini_Pval_array[1][i]
    P_value_BO = Bonferroni_Pval_array[1][i]
    BH_dict[key] = P_value_BH
    BO_dict[key] = P_value_BO
    i += 1
# -----------------------------------------------------
# Correction of multiple comparaison with Bonferronin pvals_GR = model_GRT.pvalues pvals_GR_fwer = multicomp.multipletests(pvals_GR, alpha = 0.05, method = 'bonferroni') """ X1 = np.array(df_drop[df_drop.label==r][['AgeAtDiagnosis', 'mean']]) #X1 = np.array(df_drop[df_drop.label==r]["mean"]) Y1 = np.array(df_drop[df_drop.label==r][VD]) ## Fit and summary: model_SGRT = sm.OLS(Y1, X1).fit() print(model_SGRT.summary()) # Correction of multiple comparaison with Bonferroninipyth pvals = model_SGRT.pvalues pvals_fwer = multicomp.multipletests(pvals, alpha = 0.05, method = 'bonferroni') """ # PLot the significant roi without global mean RT effect pval_roi = pvals_GR_fwer[0] if pval_roi[1:2].astype(str) == "True": mask, roi_nii = get_roi_mask(atlas_nii, label_number) output= os.path.join(maskfile,"%s.png"%(label_number)) plotting.plot_roi(roi_nii, anat_nii, output_file= output, title="plot_roi %s"%(label_number)) else: continue """ # concatenate all the results into a pd dataframe
def run_analysis(self):
    """Run the per-sample reassignment analysis and write all result tables.

    Steps, as implemented below:

    1. Load the z-score table named by ``self.par['zscore_file']`` and two
       alignment tables, concatenate the alignments, and build a boolean
       table of entries whose value is at least 80.
    2. For every z-score column (sample), keep the rows whose z-score is at
       least ``self.par['Z_threshold']`` and compute the independent hits
       with ``flex_array.gen_ind_hits``.
    3. Run ``flex_array.binom_reassign`` for every sample that has at least
       one counted hit, in parallel.
    4. Write the count, peptide, p-value and filter tables, the
       Benjamini-Hochberg adjusted p-values, the independent-peptide list
       and a per-sample summary restricted to adjusted p-values below
       ``self.par['bh_threshold']``.

    Returns:
        None. All results are written to files whose names start with
        ``self.par['sub_dir']``.
    """
    zdf = flex_array.standard_df(self.par['zscore_file'])

    aln_reader = params.file_IO(
        '../ref_seq/pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
        '\t')
    orig_aln = aln_reader.flat_file_to_df([0, 1, 15])
    aln_reader = params.file_IO(
        '../ref_seq/new_pep_against_human_viruses.tblastn.species.noseg.WS3.max_target_seqs100000.180625 (1).m8',
        '\t')
    new_aln = aln_reader.flat_file_to_df([0, 1, 15])

    # The original table's index carries a prefix before the first '_';
    # keep only the second '_'-separated field so both tables share labels.
    orig_aln.index = [i.split('_')[1] for i in orig_aln.index]
    aln_df = pd.concat([orig_aln, new_aln])
    aln_df.fillna(0, inplace=True)

    # Boolean table: True where the alignment value is >= 80.
    binary_b = aln_df[aln_df >= 80].fillna(0)
    binary_b = pd.DataFrame(index=binary_b.index,
                            columns=binary_b.columns,
                            data=binary_b.values,
                            dtype=bool)
    binary_b = flex_array.array(binary_b).filter_aln(
        ref_seq=self.par['dir_ref_seq'])
    binary_b = binary_b.reindex(zdf.index).fillna(0)
    aln_df = aln_df.loc[:, binary_b.columns]

    # Result tables: one row per column of binary_b, one column per sample.
    row_labels = list(binary_b)
    sample_labels = list(zdf)
    sum_df = pd.DataFrame(0, index=row_labels, columns=sample_labels)
    glob_unique = pd.DataFrame(0, index=row_labels, columns=sample_labels)
    pep_df = pd.DataFrame(np.nan, index=row_labels, columns=sample_labels)
    p_df = pd.DataFrame(index=row_labels, columns=sample_labels)
    n_df = pd.DataFrame(index=row_labels, columns=sample_labels)
    padjust_df = pd.DataFrame(index=row_labels, columns=sample_labels)
    orig_p = pd.DataFrame(index=row_labels, columns=sample_labels)
    filter_df = pd.DataFrame(index=row_labels, columns=sample_labels)
    hits_series = pd.Series(index=sample_labels)
    nonoverlap_hits_series = pd.Series(index=sample_labels)

    samples = list(zdf.columns)
    nonoverlap_dict = {}
    parallel_dict1 = {}  # sample name -> alignment rows of its hits
    parallel_dict2 = {}  # sample name -> hits series

    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for sample_name, column in zdf.items():
        hits = column[column >= self.par['Z_threshold']].copy()
        if self.par['use_filter']:
            nonoverlap_hits = flex_array.gen_ind_hits(
                hits, self.dependent_pep, self.par['graph_dir'],
                samples.index(sample_name))
            input_num = len(nonoverlap_hits)
        else:
            # Without the filter every hit is kept, but the reported
            # "nonoverlapped" count is still the independent-hit count.
            nonoverlap_hits = hits.copy()
            input_num = len(
                flex_array.gen_ind_hits(hits, self.dependent_pep,
                                        self.par['graph_dir'],
                                        samples.index(sample_name)))
        hits_series[sample_name] = len(hits)
        nonoverlap_hits_series[sample_name] = input_num
        nonoverlap_dict[sample_name] = list(nonoverlap_hits.index)
        print("%s:\thits=%s, nonoverlapped=%s" %
              (sample_name, len(hits), input_num))
        if input_num > 0:
            parallel_dict1[sample_name] = aln_df.loc[nonoverlap_hits.index]
            parallel_dict2[sample_name] = nonoverlap_hits

    results = Parallel(n_jobs=-1)(
        delayed(flex_array.binom_reassign)
        (zb_df, parallel_dict2[name], name, self.dependent_pep,
         self.par['dir_ref_seq'], self.par['p_threshold'],
         self.par['x_threshold'], self.par['organism'])
        for name, zb_df in parallel_dict1.items())

    # Iterate the result tuples directly; unpacking zip(*results) raises
    # ValueError when no sample had any hit (empty results).
    for (collapse_zb, glob_array, sim_tag, p_series, orig_pseries,
         filter_series, sample_name, n_series) in results:
        sum_df[sample_name] = collapse_zb.apply(sum, axis=0) + sim_tag
        glob_unique[sample_name] = glob_array.apply(sum, axis=0) + sim_tag
        pep_df[sample_name] = collapse_zb.apply(
            lambda x: flex_array.array(x).names_string(0.001), axis=0)
        n_df[sample_name] = n_series
        p_df[sample_name] = p_series
        orig_p[sample_name] = orig_pseries
        filter_df[sample_name] = filter_series

    # Output prefix: sub_dir + z-score file name without path and extension.
    file_head = self.par['sub_dir'] + self.par['zscore_file'].split(
        '/')[-1].split('.')[0]
    if self.par['organism']:
        file_head += '_organism_'
    else:
        file_head += '_species_'

    # Write log file
    params.file_IO(self.par['sub_dir'] + 'parameters.log',
                   sep='=').dict_to_file(self.par)

    # Write analysis files
    for table, suffix in ((sum_df, 'total-counts.txt'),
                          (glob_unique, 'unique-counts.txt'),
                          (pep_df, 'peptides.txt'),
                          (p_df, 'p-values.txt'),
                          (orig_p, 'orig-p-values.txt'),
                          (filter_df, 'virus-filter.txt')):
        table.to_csv(file_head + suffix,
                     sep='\t',
                     header=True,
                     index_label='Specie')

    # Benjamini-Hochberg adjustment per sample, applied to the finite
    # p-values only; other positions stay NaN. Samples whose p-values are
    # all null are left untouched.
    for i in p_df.columns:
        pvals = np.array(p_df[i].values)
        if not pd.isnull(pvals).all():
            finite_idx = np.where(np.isfinite(pvals))[0]
            pval_corrected = np.full(pvals.shape, np.nan)
            pval_corrected[finite_idx] = multipletests(pvals[finite_idx],
                                                       method='fdr_bh')[1]
            padjust_df[i] = pval_corrected
    padjust_df.to_csv(file_head + 'p-adjusted.txt',
                      sep='\t',
                      header=True,
                      index_label='Specie')

    # Write independent peptides file (context manager closes the handle
    # even if a write fails).
    with open(self.par['sub_dir'] + 'independent_peptides.txt', 'w') as f:
        for i in samples:
            f.write(i)
            for j in nonoverlap_dict[i]:
                f.write('\t' + str(j))
            f.write('\n')

    # Write summary file
    with open(file_head + 'results_summary.txt', 'w') as f:
        f.write(
            "Sample name\tVirus\tBH p-value\tRaw p-value\tOrig p-value\tAssigned counts\tFiltered Assigned Counts\t"
        )
        f.write(
            "Assigned peptides\tTotal significant peptides\tRanking N\tTotal sample hits\tTotal filtered sample hits\n"
        )
        for i in samples:
            BH = padjust_df[i]
            BH = BH[BH < self.par['bh_threshold']]
            p_value = p_df[i][BH.index]
            n_value = n_df[i][BH.index]
            filter_value = filter_df[i][BH.index]
            orig_pvalue = orig_p[i][BH.index]
            counts = sum_df[i][BH.index]
            peptides = pep_df[i][BH.index]

            # Number of distinct peptides over all significant rows of this
            # sample. It does not depend on j, so it is computed once, and
            # only when a row is actually written.
            total_sig_peptides = None
            for j in BH.index:
                if filter_value[j] > self.par['x_threshold']:
                    if total_sig_peptides is None:
                        pep_set = set()
                        for k in BH.index:
                            pep_set.update(peptides[k].split(';'))
                        total_sig_peptides = len(pep_set)
                    f.write(i + '\t')
                    f.write(j + '\t' + str(BH[j]) + '\t')
                    f.write(
                        str(p_value[j]) + '\t' + str(orig_pvalue[j]) + '\t')
                    f.write(
                        str(counts[j]) + '\t' + str(filter_value[j]) + '\t' +
                        str(peptides[j]) + '\t')
                    f.write(str(total_sig_peptides) + '\t')
                    f.write(str(n_value[j]) + '\t')
                    f.write(
                        str(hits_series[i]) + '\t' +
                        str(nonoverlap_hits_series[i]) + '\n')

    print("End of run.")
    return None
def alpha_group_significance(output_dir: str, alpha_diversity: pd.Series,
                             metadata: qiime2.Metadata) -> None:
    """Write Kruskal-Wallis group comparisons of an alpha diversity metric.

    For every usable categorical metadata column the samples are grouped by
    the column's values. A Kruskal-Wallis test is run across all groups and
    between every pair of groups, and the pairwise p-values get a
    Benjamini-Hochberg q-value. Each column produces one ``.jsonp`` file and
    one pairwise ``.csv`` file in ``output_dir``; the merged metadata, the
    rendered index page and the ``dist`` assets are written there as well.

    Raises:
        ValueError: If no metadata column survives the column filters.
    """
    # Keep only metadata rows for IDs present in the alpha diversity data.
    metadata = metadata.filter_ids(alpha_diversity.index)

    # Filter in two passes so that the columns dropped for being
    # non-categorical and the columns dropped for failing the statistical
    # requirements can be reported separately.
    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(column_type='categorical')
    non_categorical_columns = columns_before - set(metadata.columns)

    columns_before = set(metadata.columns)
    metadata = metadata.filter_columns(drop_all_unique=True,
                                       drop_zero_variance=True,
                                       drop_all_missing=True)
    filtered_columns = columns_before - set(metadata.columns)

    if len(metadata.columns) == 0:
        raise ValueError(
            "Metadata does not contain any columns that satisfy this "
            "visualizer's requirements. There must be at least one metadata "
            "column that contains categorical data, isn't empty, doesn't "
            "consist of unique values, and doesn't consist of exactly one "
            "value.")

    metric_name = alpha_diversity.name

    # Save the metadata merged with the metric so the visualization can
    # offer it for download.
    alpha_diversity.index.name = 'id'
    merged = metadata.merge(qiime2.Metadata(alpha_diversity.to_frame()))
    merged.save(os.path.join(output_dir, 'metadata.tsv'))

    filenames = []
    filtered_group_comparisons = []
    for column in metadata.columns:
        metadata_column = metadata.get_column(column).drop_missing_values()

        initial_data_length = alpha_diversity.shape[0]
        data = pd.concat(
            [alpha_diversity, metadata_column.to_series()],
            axis=1,
            join='inner')
        filtered_data_length = data.shape[0]

        # One (label, metric values) entry per group, in groupby order.
        grouped = [('%s (n=%d)' % (value, len(members)),
                    list(members[metric_name]))
                   for value, members in data.groupby(metadata_column.name)]
        names = [label for label, _ in grouped]
        groups = [values for _, values in grouped]

        escaped_column = quote(column).replace('/', '%2F')
        filename = 'column-%s.jsonp' % escaped_column
        filenames.append(filename)

        # Kruskal-Wallis across all groups.
        kw_H_all, kw_p_all = scipy.stats.mstats.kruskalwallis(*groups)

        # Kruskal-Wallis for every pair of groups; pairs for which the test
        # raises ValueError are recorded and reported instead.
        pairwise_rows = []
        for later, (later_name, later_values) in enumerate(grouped):
            for earlier_name, earlier_values in grouped[:later]:
                try:
                    H, p = scipy.stats.mstats.kruskalwallis(
                        later_values, earlier_values)
                except ValueError:
                    filtered_group_comparisons.append([
                        '%s:%s' % (column, later_name),
                        '%s:%s' % (column, earlier_name)
                    ])
                else:
                    pairwise_rows.append([earlier_name, later_name, H, p])

        kw_H_pairwise = pd.DataFrame(
            pairwise_rows, columns=['Group 1', 'Group 2', 'H', 'p-value'])
        kw_H_pairwise = kw_H_pairwise.set_index(['Group 1', 'Group 2'])
        # Correct the pairwise p-values for multiple comparisons.
        kw_H_pairwise['q-value'] = multipletests(kw_H_pairwise['p-value'],
                                                 method='fdr_bh')[1]
        kw_H_pairwise = kw_H_pairwise.sort_index()

        pairwise_fn = 'kruskal-wallis-pairwise-%s.csv' % escaped_column
        kw_H_pairwise.to_csv(os.path.join(output_dir, pairwise_fn))

        # Emit the JSONP payload: a single load_data(...) call whose
        # arguments are written piece by piece.
        with open(os.path.join(output_dir, filename), 'w') as fh:
            fh.write("load_data('%s'," % column)
            pd.Series(groups, index=names).to_json(fh, orient='split')
            fh.write(",")
            json.dump(
                {
                    'initial': initial_data_length,
                    'filtered': filtered_data_length
                }, fh)
            fh.write(",")
            json.dump({'H': kw_H_all, 'p': kw_p_all}, fh)
            fh.write(",'")
            html_table = q2templates.df_to_html(kw_H_pairwise)
            fh.write(html_table.replace('\n', '').replace("'", "\\'"))
            fh.write("','%s', '%s');" % (quote(pairwise_fn), metric_name))

    index = os.path.join(TEMPLATES, 'alpha_group_significance_assets',
                         'index.html')
    context = {
        'columns': [quote(fn) for fn in filenames],
        'non_categorical_columns': ', '.join(sorted(non_categorical_columns)),
        'filtered_columns': ', '.join(sorted(filtered_columns)),
        'filtered_group_comparisons':
        '; '.join([' vs '.join(e) for e in filtered_group_comparisons])
    }
    q2templates.render(index, output_dir, context=context)

    shutil.copytree(
        os.path.join(TEMPLATES, 'alpha_group_significance_assets', 'dist'),
        os.path.join(output_dir, 'dist'))
def fdr(x):
    """Return the Benjamini-Hochberg corrected p-values for ``x``.

    ``alpha`` is set to 0.05 divided by the number of columns of ``pvals``,
    a name taken from the enclosing scope.
    """
    # NOTE(review): per the statsmodels docs alpha should only influence the
    # reject mask (element 0), not the corrected p-values returned here;
    # confirm before relying on the division having any effect.
    per_column_alpha = 0.05 / pvals.shape[1]
    outcome = multipletests(x, method='fdr_bh', alpha=per_column_alpha)
    return outcome[1]
# print dof
# print expected
import statsmodels.stats.multicomp
# print frame['a']
# print pd.crosstab(frame['a'],frame['b'])
# crossFD = pd.crosstab(frame['a'],frame['b'])
# print chi2_contingency([crossFD['a'],crossFD['b']],False)
# print chi2_contingency([frame['a'],frame['b']],True)
from scipy.stats import chisquare
# chisquare_value, race_pvalue = chisquare(frame['a'], frame['b'])
# print chisquare_value, race_pvalue
# print(chisquare(f_obs=frame['a'], f_exp=frame['b']))[1]
import scipy
from scipy.stats import chisquare
# print chisquare(frame['a'], f_exp=frame['b'])
#
#
# print chisquare(frame['a'], f_exp=frame['b'], ddof=1)
#
# print chisquare(frame['a'])
# print scipy.stats.chi2_contingency([frame['a'],frame['b'] ])
from statsmodels.sandbox.stats.multicomp import multipletests

# Print the multiple-testing result for the "treat1" column with the
# function's default settings. The parenthesised single-argument form prints
# the same text under Python 2 and is valid syntax under Python 3, where the
# bare print statement is a SyntaxError.
print(multipletests(frame["treat1"]))
# print multipletests(frame["block1"])
def GLM(file, score, stat, ind_var, Level, betas=1):
    """Fit an OLS model of ``score`` on ``ind_var`` and save tables and plots.

    The function cross-validates the model with one fold per row, saves the
    mean training confusion matrix as a PNG, fits the model on all rows,
    corrects its p-values with ``multipletests``, writes a one-row CSV
    summary and saves a bar plot of the t-values.

    Args:
        file: Path of the CSV file read with ``pd.read_csv``.
        score: Name of the dependent-variable column; rows where it is null
            are dropped before fitting.
        stat: Label used in titles and as the first component of every
            output path (``os.path.join(stat, score, ...)``).
        ind_var: Sequence of regressor column names; also used as the x tick
            labels of the t-value plot.
        Level: Label used in the CSV file name and in the printout of
            significant regressors.
        betas: Number of leading coefficients returned as
            ``betas_component``.

    Returns:
        tuple: ``(df_final, db, betas_component, pvals, cvrsq, tras,
        category, teas)`` where ``df_final`` is the one-row summary frame,
        ``db`` the filtered input frame, ``pvals`` the uncorrected model
        p-values, ``cvrsq`` the cross-validated R-square, ``tras`` the
        per-fold training accuracies, ``category`` the percentage of each
        confusion-matrix column that lies on the diagonal, and ``teas`` the
        cross-validated accuracy.
    """
    # Create pandas dataframe
    df_final = pd.DataFrame(columns=[
        'Score', 'stat', 'beta', 'tvalue', 'pvalue', 'pval_bonferroni',
        'signi_bonferonni', 'Rsquare', 'std'
    ])
    db = pd.read_csv(file)

    # Get rid of rows with null values for the score column
    db = db[db[score].notnull()]

    # Select variables
    Y = np.array(db[score])
    X = np.array(db[ind_var])

    # Cross validation with one fold per row (LOOCV).
    # tras = training-set accuracies; teas = test-set accuracy.
    kf = KFold(Y.shape[0], n_folds=Y.shape[0])
    predictions = []
    tras = []
    confus = []
    cm_shape_max = int(np.max(db[score]) + 1)
    for train_index, test_index in kf:
        results = sm.OLS(Y[train_index], X[train_index]).fit()

        # Training predictions are rounded and clipped at zero before they
        # are compared with the observed scores.
        pred = np.round(np.dot(X[train_index], results.params))
        pred[pred < 0] = 0
        tras.append(np.sum(Y[train_index] == pred) / float(len(Y[train_index])))

        # Row-normalised training confusion matrix; rows with no samples
        # produce NaN, which is replaced by zero.
        cm = confusion_matrix(Y[train_index], pred)
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm[isnan(cm)] = 0
        # Only matrices of the full expected size enter the mean.
        if cm.shape[0] == cm_shape_max:
            confus.append(cm)

        predictions.append(np.dot(X[test_index], results.params))

    predictions = np.ravel(predictions)
    confus = np.mean(confus, axis=0)
    plot_confusion_matrix(confus,
                          title="Mean confusion matrix_" + stat + "_for_" +
                          score)
    plt.savefig(
        os.path.join(stat, score,
                     'Mean_confusion_matrix_' + stat + "_" + score + ".png"))
    plt.close()

    predictions = np.round(predictions)
    predictions[predictions < 0] = 0
    cvrsq = 1 - (np.sum((Y - predictions)**2) / np.sum((Y - np.mean(Y))**2))

    # Compute confusion matrix of the held-out predictions
    cm = confusion_matrix(Y, predictions)
    np.set_printoptions(precision=2)
    print('Confusion matrix, without normalization')
    print(cm)
    teas = np.sum(Y == predictions) / float(len(Y))

    # Percentage of each column's total that lies on the diagonal; empty
    # columns give NaN, which is replaced by zero.
    category = np.array([(hit / np.sum(cm[:, col])) * 100
                         for col, hit in enumerate(np.diagonal(cm))])
    category[isnan(category)] = 0

    # RUN GLM
    model = sm.OLS(Y, X).fit()
    pvals = model.pvalues
    # NOTE(review): the result columns are named "*_bonferroni" but the
    # correction applied here is 'fdr_bh' -- confirm which one is intended.
    pvals_fwer = multicomp.multipletests(pvals, alpha=0.05, method='fdr_bh')

    # Save it into csv file
    df_final.loc[len(df_final)] = [
        score, stat, model.params, model.tvalues, model.pvalues,
        pvals_fwer[1], pvals_fwer[0], model.rsquared, model.bse
    ]
    df_final.to_csv(
        os.path.join(stat, score, score + "_" + stat + "_" + Level + ".csv"))

    # Check quickly if there is significant data. The loop variable is the
    # p-value itself, so it is compared directly; the previous code used it
    # as an index into model.pvalues.
    for idx, p_value in enumerate(model.pvalues):
        if p_value < 0.05:
            print(score + " " + stat + " " + Level + ind_var[idx])
            print(p_value)

    betas_component = model.params[0:betas]

    # Plot the t-values, one bar per regressor
    y = model.tvalues
    x = np.array(range(len(ind_var)))
    fig = plt.figure()
    ax = fig.add_subplot(111)
    width = 0.35
    rec = ax.bar(x, y, width, color='green')
    plt.subplots_adjust(bottom=0.45)
    plt.xticks(x, ind_var, rotation='vertical')
    plt.ylabel(score + "_" + stat)
    plt.xlabel("Rsquare %s" % (model.rsquared))

    # Annotate every bar with its uncorrected p-value
    labels = ["p = %f" % p for p in model.pvalues]
    for rect, label in zip(rec.patches, labels):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2,
                height + 2,
                label,
                ha='center',
                va='bottom',
                weight='light',
                size='xx-small')
    # NOTE(review): brain_type is not a parameter or local of this function;
    # presumably a module-level name (the CSV above uses Level) -- confirm.
    plt.savefig(
        os.path.join(stat, score,
                     score + "_" + stat + "_" + brain_type + "_" + ".png"))
    plt.close()

    return df_final, db, betas_component, pvals, cvrsq, tras, category, teas