def loaddata(data1, data2, parameter, name1, name2, rsmpls=None):
    # Pull the chosen parameter out of both datasets and place the two
    # groups side by side as named columns.
    temp = pd.concat([data1[parameter], data2[parameter]], axis=1, sort=True)
    temp.columns = [name1, name2]
    # Use dabest's default resample count unless one was requested.
    if rsmpls is not None:
        bootstrap = dabest.load(temp, idx=(name1, name2), resamples=rsmpls)
    else:
        bootstrap = dabest.load(temp, idx=(name1, name2))
    return bootstrap
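# A minimal usage sketch for loaddata, assuming pandas and dabest are
# imported at module level. The 'latency' column and the group labels
# below are invented for illustration.
import numpy as np
import pandas as pd
import dabest

rng = np.random.default_rng(0)
ctrl = pd.DataFrame({'latency': rng.normal(10.0, 2.0, 40)})
test = pd.DataFrame({'latency': rng.normal(12.0, 2.0, 40)})
# Build the two-group dabest object, then print the mean difference with
# its bootstrap 95% confidence interval.
bootstrap = loaddata(ctrl, test, 'latency', 'control', 'treatment',
                     rsmpls=5000)
print(bootstrap.mean_diff)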
def esci_indep_cohens_d(data1, data2, n_boot=5000, has_preds=False):
    '''Compute Cohen's d effect size and its bootstrap 95% confidence
    interval (using the bias-corrected accelerated bootstrap).

    Parameters
    ----------
    data1 : np.ndarray
        One-dimensional array of values for the "high" group (for example
        diagnosed participants).
    data2 : np.ndarray
        One-dimensional array of values for the "low" group (for example
        healthy controls).
    n_boot : int
        Number of bootstraps to use.
    has_preds : bool
        Whether an array of predictors is provided in the data. If so, the
        first column of data1 and data2 contains data for the separate
        groups and the following columns are the predictors used in the
        regression, with the predictor of interest (group membership)
        being the last one and the rest treated as confounds.

    Returns
    -------
    stats : dict
        Dictionary of results.

        * ``stats['es']`` contains the effect size.
        * ``stats['ci']`` contains the 95% confidence interval for the
          effect size.
        * ``stats['bootstraps']`` contains bootstrap effect size values.
    '''
    if not has_preds:
        assert data2 is not None
        import dabest
        df = utils.psd_to_df(data1, data2)
        dbst_set = dabest.load(df, idx=("controls", "diagnosed"),
                               x="group", y="FAA", resamples=n_boot)
        results = dbst_set.cohens_d.results
        cohen_d = results.difference.values[0]
        cohen_d_ci = (results.bca_low.values[0], results.bca_high.values[0])
        bootstraps = results.bootstraps[0]
    else:
        from borsar.stats import compute_regression_t
        import scikits.bootstrap as boot

        def regression_Cohens_d(data1, data2):
            data = np.concatenate([data1, data2], axis=0)
            preds = data[:, 1:]
            tvals = compute_regression_t(data[:, [0]], preds)
            return d_from_t_categorical(tvals[-1, 0], preds)

        cohen_d = regression_Cohens_d(data1, data2)
        cohen_d_ci, bootstraps = boot.ci(
            (data1, data2), regression_Cohens_d, multi='independent',
            n_samples=n_boot, return_dist=True)

    stats = dict(es=cohen_d, ci=cohen_d_ci, bootstraps=bootstraps)
    return stats
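# A sketch of the t-to-d conversion that d_from_t_categorical presumably
# performs; the helper itself is defined elsewhere in this project, so the
# function below is only an illustration. For a two-level categorical
# predictor, Cohen's d relates to the regression t value through the group
# sizes: d = t * sqrt(1/n1 + 1/n2).
import numpy as np

def _d_from_t_sketch(t, preds):
    # preds: design matrix; last column codes group membership (two levels)
    group = preds[:, -1]
    levels = np.unique(group)
    n1 = np.sum(group == levels[0])
    n2 = np.sum(group == levels[1])
    return t * np.sqrt(1.0 / n1 + 1.0 / n2)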
def plotContrasts(self, y, colorBy, compareBy, groupBy='Temperature',
                  plot_kwargs=dict()):
    resultsDf = self.resultsDf
    resultsDf['newPlotColumn'] = (resultsDf[groupBy] + ' ' +
                                  resultsDf[compareBy])
    # Build the dabest idx: one tuple of '<group> <compare>' labels per
    # group level, with the compare levels in reverse order. (The previous
    # construction nested these tuples incorrectly beyond two group levels.)
    groups = np.unique(resultsDf[groupBy])
    compares = np.unique(resultsDf[compareBy])[::-1]
    if len(groups) == 1:
        listIdx = tuple(groups[0] + ' ' + compares)
    else:
        listIdx = tuple(tuple(g + ' ' + compares) for g in groups)
    print(listIdx)
    customPalette = locoPlotters.espressoCreatePalette(resultsDf[colorBy])
    setFont('Source Sans Bold', 10)
    dabestContrastData = dabest.load(
        resultsDf,
        x='newPlotColumn',  # the default for this test config is to group flies by genotype
        y=y,
        idx=listIdx,
        paired=False)
    fig = dabestContrastData.mean_diff.plot(color_col=colorBy,
                                            custom_palette=customPalette,
                                            **plot_kwargs)
    # Split the combined labels back into compare (C) and group (G) parts.
    if len(groups) == 1:
        flatListIdxC = [item.split(' ')[1] for item in listIdx]
        flatListIdxG = [item.split(' ')[0] for item in listIdx]
    else:
        flatListIdxC = [item.split(' ')[1] for t in listIdx for item in t]
        flatListIdxG = [item.split(' ')[0] for t in listIdx for item in t]
    fig.axes[0].set_xticklabels(flatListIdxC, rotation=45, ha="right")
    ylim = fig.axes[0].get_ylim()
    # Label each pair of columns with its group name (assumes two compared
    # levels per group).
    for i in range(0, len(groups)):
        # fig.axes[0].text(0.5, ylim[1], flatListIdxG[0], ha="center")
        fig.axes[0].text(0.5 + 2 * i, ylim[1] * 1.1, flatListIdxG[2 * i],
                         ha="center")
    locoUtilities.espressoSaveFig(fig, y + '_contrast',
                                  self.metaDataDf.Date[0], self.outputFolder)
    return fig
def _parallel_region_dabest(self, Region, Freq):
    """Compute regionwise group differences."""
    print(f'DABEST on region {Region}, Frequency: {Freq}')
    df_pivot = self.GBC_df.pivot(index=['Subject', 'Group'],
                                 columns='Frequency',
                                 values=Region).reset_index()
    # Bootstrap test with DABEST
    analysis = dabest.load(df_pivot, idx=("Control", "FEP"),
                           x='Group', y=Freq, ci=90)
    results = analysis.mean_diff.results
    # Levene test for equality of variances between the two groups
    # (previously this compared the Control group against itself).
    _, pval = scipy.stats.levene(
        df_pivot.loc[df_pivot['Group'] == 'Control', Freq],
        df_pivot.loc[df_pivot['Group'] == 'FEP', Freq])
    results['levene-p-value'] = pval
    # Insert region name into the result df
    results.insert(loc=0, column='Region', value=Region)
    return results
def _parallel_net_dabest(self, Measure, Freq):
    """Apply DABEST to a graph measure; called in dabest_net_measures."""
    print(f'DABEST on Graph Measure {Measure}, Frequency: {Freq}')
    df_pivot = self.Net_df.pivot(index=['Subject', 'Group'],
                                 columns='Frequency',
                                 values=Measure).reset_index()
    # Bootstrap test with DABEST
    analysis = dabest.load(df_pivot, idx=("Control", "FEP"),
                           x='Group', y=Freq, ci=90)
    results = analysis.mean_diff.results
    # Levene test for equality of variances between the two groups
    _, pval = scipy.stats.levene(
        df_pivot.loc[df_pivot['Group'] == 'Control', Freq],
        df_pivot.loc[df_pivot['Group'] == 'FEP', Freq])
    results['levene-p-value'] = pval
    # Insert measure name into the result df
    results.insert(loc=0, column='Measure', value=Measure)
    return results
def dabest_avg_GBC(self):
    """Function to calculate effect size and t/p value for average GBC."""
    df_long = pd.read_pickle(
        self.find(suffix='GBC', filetype='.pkl', Freq=self.Frequencies))
    df_wide = pd.pivot_table(df_long, index=['Group', 'Subject'],
                             columns='Frequency',
                             values='Avg. GBC').reset_index()
    res_list = []
    for Freq in self.FrequencyBands.keys():
        analysis = dabest.load(df_wide, idx=("Control", "FEP"),
                               x='Group', y=Freq, ci=90)
        results = analysis.mean_diff.results
        results.insert(loc=0, column='Frequency', value=Freq)
        res_list.append(results)
    result_df = pd.concat(res_list)
    # Save Pickle
    FileName = self.createFileName(suffix='Mean-GBC-DABEST',
                                   filetype='.pkl', Freq=self.Frequencies)
    FilePath = self.createFilePath(self.EdgeStatsDir, 'GBC', 'Stats',
                                   FileName)
    result_df.to_pickle(FilePath)
    # Save CSV
    FileName = self.createFileName(suffix='Mean-GBC-DABEST',
                                   filetype='.csv', Freq=self.Frequencies)
    FilePath = self.createFilePath(self.EdgeStatsDir, 'GBC', 'Stats',
                                   FileName)
    result_df.to_csv(FilePath)
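# For reference, a minimal sketch of pulling single numbers out of a
# dabest results DataFrame like the ones collected above; the column
# names (difference, bca_low, bca_high) follow the dabest API.
def summarize_effect(results):
    # Return the mean difference and its bootstrap confidence interval
    # from the first comparison in the results table.
    es = results['difference'].values[0]
    ci = (results['bca_low'].values[0], results['bca_high'].values[0])
    return es, ci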
def plot_massplot(MASS, cluster2MASS):
    # Write one column of mass values per cluster, padding ragged columns
    # with NaN so every CSV row has the same width.
    with open(str(MASS) + '_mass.csv', mode='w') as mass_file:
        mass_writer = csv.writer(mass_file, delimiter=',', quotechar='"',
                                 quoting=csv.QUOTE_MINIMAL)
        mass_writer.writerow(list(cluster2MASS.keys()))
        max_length = 0
        for cluster in cluster2MASS:
            if len(cluster2MASS[cluster]) > max_length:
                max_length = len(cluster2MASS[cluster])
        for i in range(max_length):
            tmp = np.full(len(list(cluster2MASS.keys())), np.nan)
            for cluster in cluster2MASS:
                if len(cluster2MASS[cluster]) > i:
                    tmp[cluster] = cluster2MASS[cluster][i]
            mass_writer.writerow(tmp)

    # Read the per-cluster mass table back in and give the columns
    # human-readable names.
    mass = pd.read_csv(str(MASS) + '_mass.csv')
    mass = mass.rename(columns={"0": "Cluster 0", "1": "Cluster 1",
                                "2": "Cluster 2", "3": "Cluster 3",
                                "4": "Cluster 4", "5": "Cluster 5"})
    # mass['Cluster 0'] = 0

    # Load the data into dabest with Cluster 0 as the shared control.
    shared_control = dabest.load(mass,
                                 idx=("Cluster 0", "Cluster 1", "Cluster 2",
                                      "Cluster 3", "Cluster 4", "Cluster 5"))

    # Produce a Cumming estimation plot.
    shared_control.mean_diff.plot()
    plt.show()
import matplotlib
%matplotlib inline
import numpy as np
import locoDataMunger
import locoUtilities
import pandas as pd
import locoPlotters
import espresso as esp
import EspressoLocomotion  # needed for the EspressoLocomotion class below
import dabest

#%%
dataFolder = '/Users/sangyuxu/xy1/'
TrhLxR50Gal = EspressoLocomotion.EspressoLocomotion(dataFolder, 0, 120)

#%%
# resultsDf = TrhCsCh.metaDataDf
groupBy = 'Temperature'
compareBy = 'Status'
colorBy = 'Genotype'
dabestContrastData = dabest.load(
    TrhLxR50Gal.resultsDf,
    x=compareBy,  # the default for this test config is to group flies by genotype
    y='averageSpeed',
    idx=np.unique(TrhLxR50Gal.metaDataDf.Status),
    paired=False)
fig = dabestContrastData.mean_diff.plot(color_col=colorBy)
print(scipy.stats.levene(EBUSCO_plot_data, Enon_BUSCO_plot_data))
print("2")
print(np.var(CBUSCO_plot_data))
print(np.var(Cnon_BUSCO_plot_data))
print(scipy.stats.levene(CBUSCO_plot_data, Cnon_BUSCO_plot_data))
print("3")
print(np.var(BBUSCO_plot_data))
print(np.var(Bnon_BUSCO_plot_data))
print(scipy.stats.levene(BBUSCO_plot_data, Bnon_BUSCO_plot_data))

# dabest
dict_data = {
    "BUSCO (2)": pd.Series(CBUSCO_plot_data),
    "non-BUSCO (2)": pd.Series(Cnon_BUSCO_plot_data),
    "BUSCO (1)": pd.Series(EBUSCO_plot_data),
    "non-BUSCO (1)": pd.Series(Enon_BUSCO_plot_data),
    "BUSCO (3)": pd.Series(BBUSCO_plot_data),
    "non-BUSCO (3)": pd.Series(Bnon_BUSCO_plot_data)
}
df = pd.DataFrame(dict_data)
multi = dabest.load(df, idx=(("BUSCO (2)", "non-BUSCO (2)"),
                             ("BUSCO (1)", "non-BUSCO (1)"),
                             ("BUSCO (3)", "non-BUSCO (3)")))
multi.mean_diff.statistical_tests.to_csv(
    "/FULL/PATH/TO/OUTPUT/DIRECTORY/dabest_BUSCO_comparison.csv")
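# The nested idx above asks dabest for three independent two-group
# comparisons sharing one figure. A stripped-down sketch of the same
# structure with invented data:
import numpy as np
import pandas as pd
import dabest

rng = np.random.default_rng(1)
toy = pd.DataFrame({'A1': rng.normal(0.0, 1.0, 30),
                    'A2': rng.normal(0.5, 1.0, 30),
                    'B1': rng.normal(0.0, 1.0, 30),
                    'B2': rng.normal(0.2, 1.0, 30)})
multi_toy = dabest.load(toy, idx=(("A1", "A2"), ("B1", "B2")))
print(multi_toy.mean_diff.statistical_tests)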
# ax.set_xlabel('Time')
# ax.set_ylabel('Linear Corr.')
# ax.set_title('Correlation: recall vs. ISC change')
# ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
# ax.legend()
# sns.despine()
# f.tight_layout()
#
# xticklabels = [f'RM-{cond}' for cond in has_memory_conds]
# f, ax = plt.subplots(1, 1, figsize=(6, 4))
# sns.violinplot(data=[r_mu_tisc[cond] for cond in has_memory_conds])
# ax.axhline(0, color='grey', linestyle='--')
# ax.set_xticks(range(len(xticklabels)))
# ax.set_xticklabels(xticklabels)
# ax.set_xlabel('Condition')
# ax.set_ylabel('Linear Correlation')
# ax.set_title('Correlation: recall vs. ISC change')
# sns.despine()
# f.tight_layout()

data_dict = {}
for cond in list(r_mu_sisc.keys()):
    data_dict[f'RM-{cond}'] = np.mean(r_val_tisc[cond], axis=-1)
df = make_df(data_dict)
db = dabest.load(data=df, x="Condition", y="Value",
                 idx=list(data_dict.keys()))
db.mean_diff.plot(swarm_label='Linear correlation', fig_size=(7, 5),
                  custom_palette=c_pal)
del dk[ptest][cond]['er'][i_ms]
for lca_pid, lca_pname in lca_pnames.items():
    del lca_param[ptest][lca_pid][cond]['mu'][i_ms]
    del lca_param[ptest][lca_pid][cond]['er'][i_ms]
del ma_lca[ptest][i_ms]

'''process the data: extract differences between the two penalty conds'''
# compute RT
rt = {ptest: None for ptest in penaltys_test}
time_vector = np.reshape(np.arange(n_param) + 1, (n_param, 1))
for ptest in penaltys_test:
    ig_p2_ = np.array(lca_param[ptest][0]['DM']['mu'])[:, n_param:].T
    ig_p2_norm = ig_p2_ / np.sum(ig_p2_, axis=0)
    rt[ptest] = np.reshape(np.dot(ig_p2_norm.T, time_vector), (-1,))

'''slope graph'''
data_dict = {'low': rt[0], 'high': rt[4]}
df = pd.DataFrame(data_dict)
df['ids'] = np.arange(n_subjs)
df.head()

# Load the data into dabest
dabest_data = dabest.load(data=df, idx=list(data_dict.keys()),
                          paired=True, id_col='ids')
dabest_data.mean_diff.plot(swarm_label='Recall time', fig_size=(8, 5),
                           swarm_ylim=[0, 6])
print(dabest_data.mean_diff)
dabest_data.mean_diff.statistical_tests
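# The paired=True / id_col combination above produces a slopegraph in
# which each subject's low- and high-penalty recall times are connected.
# A minimal sketch of the same paired call with invented data, matching
# the older dabest API used here (newer releases take a string such as
# paired='baseline' instead of True):
import numpy as np
import pandas as pd
import dabest

rng = np.random.default_rng(2)
toy = pd.DataFrame({'low': rng.normal(3.0, 1.0, 15),
                    'high': rng.normal(4.0, 1.0, 15)})
toy['ids'] = np.arange(len(toy))
paired_toy = dabest.load(data=toy, idx=('low', 'high'),
                         paired=True, id_col='ids')
print(paired_toy.mean_diff)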
# resultsDf = TrhCsCh.metaDataDf
allSpeedData.plotBoundedSpeedLines(colorBy='Sex', col='Status', rp='600s')

#%%
allSpeedData.plotMeanHeatMaps(row='Status', col='Temperature')

#%%
groupBy = 'Status'
compareBy = 'Temperature'
colorBy = 'Genotype'
results = allSpeedData.resultsDf.loc[
    allSpeedData.resultsDf['Status'] == 'Offspring']
allSpeedData
dabestContrastData = dabest.load(
    results,
    x=compareBy,  # the default for this test config is to group flies by genotype
    y='TB Preference',
    idx=np.unique(allSpeedData.metaDataDf.Temperature),
    paired=False)
fig = dabestContrastData.mean_diff.plot(color_col=colorBy)

#%%
# put in the parameters you don't want to type over and over again when
# using the plot function
groupby = 'Sex'
compareby = 'Status'
colorby = 'Genotype'
startHour = 0  # hours
endHour = 2  # hours
figAspectRatio = (8, 5)
idx_list = []
for dtype_ in data_types:
    idx_tuple_ = ('%s.%s' % (dtype_, exc_lines[0]),
                  '%s.%s' % (dtype_, exc_lines[1]))
    idx_list.append(idx_tuple_)

# %% Cumming plot
palette_mod = {
    comp_type: exc_palette[comp_type.split('.')[-1]]
    for comp_type in comp_types
}
sns.set(font_scale=1.2)
gene_comp_figname = 'figures/diff_gene_expression_exc.svg'
gene_df = dabest.load(exc_expression_melted, idx=idx_list,
                      x="Cre_gene", y='cpm')
f = gene_df.cliffs_delta.plot(
    custom_palette=palette_mod,
    group_summaries='median_quartiles',
    swarm_desat=.9,
    # swarm_ylim=(1e-5, 1e-3),
    swarmplot_kwargs={'size': 2.5})
rawdata_axes = f.axes[0]
rawdata_axes = man_utils.annotate_sig_level(data_types, exc_lines, 'Cre_line',
                                            gene_sig_grouped, 'Comp_type',
                                            exc_expression_melted,
print ("Correlation matrix\n") print (sheet.corr(method='pearson')) print ("----------------------------") fig, (ax1, ax2) = plt.subplots(2, 1) for col in sheet.columns: ax1.plot(sheet[col].values) ax2 = sheet.boxplot() """ From Seaborn """ sns.pairplot(sheet) """ From dabest """ print ("----------------------------") two_groups_unpaired = dabest.load(sheet, idx=(sheet.columns[0], sheet.columns[1]), resamples=5000) two_groups_unpaired.mean_diff.plot() two_groups_unpaired.hedges_g.plot() stat=two_groups_unpaired.mean_diff.statistical_tests print ('Further Statistics from the first 2 columns') print (stat.transpose()) #two_groups_unpaired.mean_diff #two_groups_unpaired.mean_diff.results #two_groups_unpaired.mean_diff.statistical_tests #two_groups_unpaired.hedges_g.results
comp_types = sag_features_all['Cre_type'].unique().tolist()
data_types = sag_features_all.type.unique().tolist()
idx_list = []
for dtype_ in data_types:
    idx_tuple_ = ('%s.%s' % (dtype_, exc_lines[0]),
                  '%s.%s' % (dtype_, exc_lines[1]))
    idx_list.append(idx_tuple_)

sns.set(font_scale=1)
palette_mod = {
    comp_type: palette[comp_type.split('.')[-1]]
    for comp_type in comp_types
}
analysis_of_long_df = dabest.load(sag_features_all, idx=idx_list,
                                  x="Cre_type", y=select_sag_feature)
f = analysis_of_long_df.cliffs_delta.plot(custom_palette=palette_mod,
                                          group_summaries='median_quartiles',
                                          swarm_desat=.9)
rawdata_axes = f.axes[0]
rawdata_axes = man_utils.annotate_sig_level(sig_vars, exc_lines, 'Cre_line',
                                            ephys_sig_group, 'Comp_type',
                                            sag_features_all, 'type',
                                            select_sag_feature, rawdata_axes,
    if type_.split('.')[-1] != 'L5 CF' else subclass_colors['L5 PT']
    for type_ in feature_data.feat_ttype.unique()
}
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(15, 8))
idx_feat = []
for dtype_ in feature_select:
    idx_tuple_ = ('%s.%s' % (dtype_, exc_subclasses[0]),
                  '%s.%s' % (dtype_, exc_subclasses[1]))
    idx_feat.append(idx_tuple_)
feature_data_select = feature_data.loc[
    feature_data.features.isin(feature_select), ]
analysis_df_feat = dabest.load(feature_data_select, idx=idx_feat,
                               x='feat_ttype', y='value')
f = analysis_df_feat.cliffs_delta.plot(ax=ax,
                                       custom_palette=palette_features,
                                       group_summaries='median_quartiles',
                                       swarm_desat=.9)
ax = man_utils.annotate_sig_level(feature_select, exc_subclasses, 'ttype',
                                  feat_sig_grouped, 'Comp_type',
                                  feature_data_select, 'features', 'value',
                                  ax)
rawdata_axes = f.axes[0]
raw_xticklabels = rawdata_axes.get_xticklabels()
labels = []
for label in raw_xticklabels:
    txt = label.get_text()
           dpi=100)
channel_select = [
    channel_ for channel_ in cond_types if channel in channel_
]
idx_channel = []
for dtype_ in channel_select:
    idx_tuple_ = ('%s.%s' % (dtype_, exc_subclasses[0]),
                  '%s.%s' % (dtype_, exc_subclasses[1]))
    idx_channel.append(idx_tuple_)
param_data_select = param_data.loc[
    param_data.conductance.isin(channel_select), ]
analysis_df_channel = dabest.load(param_data_select, idx=idx_channel,
                                  x='param_ttype', y='value')
analysis_df_channel.cliffs_delta.plot(ax=ax[0],
                                      custom_palette=palette_channel,
                                      group_summaries='median_quartiles',
                                      swarm_desat=.9)
ax[0] = man_utils.annotate_sig_level(channel_select, exc_subclasses, 'ttype',
                                     cond_sig_grouped, 'Comp_type',
                                     param_data_select, 'conductance',
                                     'value', ax[0])
# ax[0].set_title(r'-log(p-val) = %.2f' % -np.log10(cond_p_val))
genes = [gene for gene in genes if gene in gene_types]