def make_anova_context(df, cols, title, fn, cid):
    print("\tMAKING ANOVA")
    fn = fn + "-" + str(cid)
    title_label = CROSS_TITLE[title]
    SIGNIFICANCE_CUTOFF = .4
    anova_text = title + "\n"

    # Guard against a homogenous column (every value identical), for which an
    # ANOVA is undefined. The original hard-coded this check to False, which
    # left the else-branch below referencing undefined variables.
    val_min = df['value'].min()
    val_max = df['value'].max()
    homogenous_data = (val_min == val_max)

    if not homogenous_data:
        aov = pg.anova(dv='value', between='question', data=df)
        aov = aov.round(3)  # round() returns a copy; the result must be assigned
        anova_text = anova_text + str(aov)
        aov.to_csv(FILENAME_ANOVAS + fn + '-anova.csv')
        p_val = aov['p-unc'][0]
        print("\t\t" + title)
        print("\t\t" + 'Across contexts:' + "->" + " p=" + str(p_val))
        # SIGNIFICANCE_CUTOFF is available for flagging factors, e.g.:
        # if p_val < SIGNIFICANCE_CUTOFF:
        #     print("Question is significant for " + title + ": " + str(p_val))

        # Bonferroni-corrected pairwise post hocs (DataFrame method added by pingouin)
        posthocs = df.pairwise_ttests(dv='value', between='question',
                                      padjust='bonf')
        anova_text = anova_text + "\n" + str(posthocs)
        posthocs.to_csv(FILENAME_ANOVAS + fn + '-posthocs.csv')
    else:
        print("! Issue creating ANOVA for " + title)
        print("Verify that there are at least a few non-identical values recorded")
        anova_text = anova_text + "Column homogenous with value " + str(val_min)

    with open(FILENAME_ANOVAS + fn + "-anova.txt", "w") as f:
        f.write(anova_text)
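# A minimal, hypothetical invocation of make_anova_context. CROSS_TITLE and
# FILENAME_ANOVAS are module-level globals the function assumes; every value
# below is illustrative, not taken from the original project.
import os
import pandas as pd
import pingouin as pg  # importing pingouin also registers df.pairwise_ttests

CROSS_TITLE = {'trust': 'Trust across contexts'}  # hypothetical
FILENAME_ANOVAS = './anovas/'                     # hypothetical output dir
os.makedirs(FILENAME_ANOVAS, exist_ok=True)

toy = pd.DataFrame({
    'question': ['q1'] * 5 + ['q2'] * 5,
    'value': [1, 2, 2, 3, 2, 4, 5, 4, 4, 5],
})
make_anova_context(toy, cols=None, title='trust', fn='pilot', cid=1)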
def btn_clk11(self):
    aov = pg.anova(data=self.df4, dv='AUC', between='groupe',
                   detailed=True).round(6)
    aov_res = pd.DataFrame(aov.T).transpose()
    model = DataFrameModel(aov_res)
    self.tableView.setModel(model)
    self.tableView.resizeColumnsToContents()
def anova_eta2(df, ax=None, skip_assert=False):
    """Compute the eta^2 statistic for an ANOVA using the ranking data"""
    z = df.stack().reset_index()
    z = z.rename(columns={0: 'Rank'})
    aov = pg.anova(dv='Rank', between=['Group', 'OdorName'], data=z,
                   detailed=True, effsize='n2').set_index('Source')
    if not skip_assert:
        assert aov.loc['Group', 'n2'] < 0.0001
        aov = aov.drop('Group')
    aov.loc['Residual', 'n2'] = 1 - aov['n2'].sum()
    aov = aov.rename(index={'Residual': 'Individual',
                            'Group': 'Culture',
                            'OdorName': 'Odorant',
                            'Group * OdorName': 'Culture x Odorant'})
    return aov
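# A sketch of the input layout anova_eta2 expects (hypothetical data, and it
# assumes a pingouin version whose anova() accepts effsize='n2'): rows indexed
# by (Group, Subject), one column per odorant with the column axis named
# 'OdorName', each cell holding the rank a subject gave that odorant.
# df.stack().reset_index() then yields the long-format Group/OdorName/Rank table.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
idx = pd.MultiIndex.from_product([['A', 'B'], range(10)],
                                 names=['Group', 'Subject'])
cols = pd.Index(['vanilla', 'smoke', 'citrus'], name='OdorName')
ranks = pd.DataFrame(rng.permuted(np.tile([1, 2, 3], (20, 1)), axis=1),
                     index=idx, columns=cols)
print(anova_eta2(ranks, skip_assert=True))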
def aovtables(df, y1, y2):
    """
    :param df: dataframe
    :param y1: factor 1
    :param y2: factor 2
    :return: ANOVA tables testing the influence of the y1 and y2 factors on
        each quantitative variable (pingouin's default Type II sums of
        squares, since ss_type is not overridden)
    """
    aovlist = []
    numlist = df.select_dtypes(exclude=['object']).columns.tolist()
    for name in numlist:
        aovlist.append([
            name,
            pg.anova(dv=name, between=[str(y1), str(y2)], data=df,
                     detailed=True)
        ])
    return aovlist
def computeAnovas(dv, between_var, data, adjust_type, effect_size_type,
                  save_dir):
    import numpy as np
    import pandas as pd
    import pingouin as pg
    from pingouin import pairwise_ttests

    # compute anova
    aov = pg.anova(dv=dv, between=between_var, data=data)
    aov.to_csv(save_dir + '/' + dv + '_anova.csv', index=False)
    print(aov)

    # compute pairwise ttests
    ttests = pg.pairwise_ttests(dv=dv, between=between_var, data=data,
                                padjust=adjust_type,
                                effsize=effect_size_type)
    ttests.to_csv(save_dir + '/' + dv + '_ttests.csv', index=False)
    print(ttests)
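# A hedged usage sketch for computeAnovas; the column names, values, and
# output directory below are placeholders rather than the original data.
import pandas as pd

toy = pd.DataFrame({
    'score': [4.1, 3.9, 5.2, 5.0, 6.3, 6.1],
    'group': ['a', 'a', 'b', 'b', 'c', 'c'],
})
# 'bonf' and 'hedges' are valid pingouin padjust / effsize options
computeAnovas(dv='score', between_var='group', data=toy,
              adjust_type='bonf', effect_size_type='hedges', save_dir='.')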
def intraclass_corr(data=None, targets=None, raters=None, ratings=None,
                    nan_policy='raise'):
    """Intraclass correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format dataframe. Data must be fully balanced.
    targets : string
        Name of column in ``data`` containing the targets.
    raters : string
        Name of column in ``data`` containing the raters.
    ratings : string
        Name of column in ``data`` containing the ratings.
    nan_policy : str
        Defines how to handle when input contains missing values (nan).
        `'raise'` (default) throws an error, `'omit'` performs the
        calculations after deleting target(s) with one or more missing
        values (= listwise deletion).

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Output dataframe:

        * ``'Type'``: ICC type
        * ``'Description'``: description of the ICC
        * ``'ICC'``: intraclass correlation
        * ``'F'``: F statistic
        * ``'df1'``: numerator degree of freedom
        * ``'df2'``: denominator degree of freedom
        * ``'pval'``: p-value
        * ``'CI95%'``: 95% confidence intervals around the ICC

    Notes
    -----
    The intraclass correlation (ICC, [1]_) assesses the reliability of
    ratings by comparing the variability of different ratings of the same
    subject to the total variation across all ratings and all subjects.

    Shrout and Fleiss (1979) [2]_ describe six cases of reliability of
    ratings done by :math:`k` raters on :math:`n` targets. Pingouin returns
    all six cases with corresponding F and p-values, as well as 95%
    confidence intervals.

    From the documentation of the ICC function in the `psych
    <https://cran.r-project.org/web/packages/psych/psych.pdf>`_ R package:

    - **ICC1**: Each target is rated by a different rater and the raters
      are selected at random. This is a one-way ANOVA fixed effects model.
      ICC1 is sensitive to differences in means between raters and is a
      measure of absolute agreement.
    - **ICC2**: A random sample of :math:`k` raters rate each target. The
      measure is one of absolute agreement in the ratings.
    - **ICC3**: A fixed set of :math:`k` raters rate each target. There is
      no generalization to a larger population of raters. ICC2 and ICC3
      remove mean differences between raters, but are sensitive to
      interactions. The difference between ICC2 and ICC3 is whether raters
      are seen as fixed or random effects.

    Then, for each of these cases, the reliability can either be estimated
    for a single rating or for the average of :math:`k` ratings. The 1
    rating case is equivalent to the average intercorrelation, while the
    :math:`k` rating case is equivalent to the Spearman Brown adjusted
    reliability. **ICC1k**, **ICC2k**, **ICC3K** reflect the means of
    :math:`k` raters.

    This function has been tested against the ICC function of the R psych
    package. Note however that contrarily to the R implementation, the
    current implementation does not use linear mixed effect but regular
    ANOVA, which means that it only works with complete-case data (no
    missing values).

    References
    ----------
    .. [1] http://www.real-statistics.com/reliability/intraclass-correlation/

    .. [2] Shrout, P. E., & Fleiss, J. L. (1979). Intraclass correlations:
           uses in assessing rater reliability. Psychological bulletin,
           86(2), 420.

    Examples
    --------
    ICCs of wine quality assessed by 4 judges.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('icc')
    >>> icc = pg.intraclass_corr(data=data, targets='Wine', raters='Judge',
    ...                          ratings='Scores').round(3)
    >>> icc.set_index("Type")
                       Description    ICC       F  df1  df2  pval         CI95%
    Type
    ICC1    Single raters absolute  0.728  11.680    7   24   0.0  [0.43, 0.93]
    ICC2      Single random raters  0.728  11.787    7   21   0.0  [0.43, 0.93]
    ICC3       Single fixed raters  0.729  11.787    7   21   0.0  [0.43, 0.93]
    ICC1k  Average raters absolute  0.914  11.680    7   24   0.0  [0.75, 0.98]
    ICC2k    Average random raters  0.914  11.787    7   21   0.0  [0.75, 0.98]
    ICC3k     Average fixed raters  0.915  11.787    7   21   0.0  [0.75, 0.98]
    """
    from pingouin import anova
    from scipy.stats import f  # F-distribution for p-values and CIs (import was missing)

    # Safety check
    assert isinstance(data, pd.DataFrame), 'data must be a dataframe.'
    assert all([v is not None for v in [targets, raters, ratings]])
    assert all([v in data.columns for v in [targets, raters, ratings]])
    assert nan_policy in ['omit', 'raise']

    # Convert data to wide-format
    data = data.pivot_table(index=targets, columns=raters, values=ratings)

    # Listwise deletion of missing values
    nan_present = data.isna().any().any()
    if nan_present:
        if nan_policy == 'omit':
            data = data.dropna(axis=0, how='any')
        else:
            raise ValueError("Either missing values are present in data or "
                             "data are unbalanced. Please remove them "
                             "manually or use nan_policy='omit'.")

    # Back to long-format
    # data_wide = data.copy()  # Optional, for PCA
    data = data.reset_index().melt(id_vars=targets, value_name=ratings)

    # Check that ratings is a numeric variable
    assert data[ratings].dtype.kind in 'bfiu', 'Ratings must be numeric.'
    # Check that data are fully balanced
    # This behavior is ensured by the long-to-wide-to-long transformation
    # Unbalanced data will result in rows with missing values.
    # assert data.groupby(raters)[ratings].count().nunique() == 1

    # Extract sizes
    k = data[raters].nunique()
    n = data[targets].nunique()

    # Two-way ANOVA
    with np.errstate(invalid='ignore'):
        aov = anova(dv=ratings, between=[targets, raters], data=data,
                    ss_type=2)

    # Extract mean squares
    msb = aov.at[0, 'MS']
    msw = (aov.at[1, 'SS'] + aov.at[2, 'SS']) / (aov.at[1, 'DF'] +
                                                 aov.at[2, 'DF'])
    msj = aov.at[1, 'MS']
    mse = aov.at[2, 'MS']

    # Calculate ICCs
    icc1 = (msb - msw) / (msb + (k - 1) * msw)
    icc2 = (msb - mse) / (msb + (k - 1) * mse + k * (msj - mse) / n)
    icc3 = (msb - mse) / (msb + (k - 1) * mse)
    icc1k = (msb - msw) / msb
    icc2k = (msb - mse) / (msb + (msj - mse) / n)
    icc3k = (msb - mse) / msb

    # Calculate F, df, and p-values
    f1k = msb / msw
    df1 = n - 1
    df1kd = n * (k - 1)
    p1k = f.sf(f1k, df1, df1kd)

    f2k = f3k = msb / mse
    df2kd = (n - 1) * (k - 1)
    p2k = f.sf(f2k, df1, df2kd)

    # Create output dataframe
    stats = {
        'Type': ['ICC1', 'ICC2', 'ICC3', 'ICC1k', 'ICC2k', 'ICC3k'],
        'Description': ['Single raters absolute', 'Single random raters',
                        'Single fixed raters', 'Average raters absolute',
                        'Average random raters', 'Average fixed raters'],
        'ICC': [icc1, icc2, icc3, icc1k, icc2k, icc3k],
        'F': [f1k, f2k, f2k, f1k, f2k, f2k],
        'df1': n - 1,
        'df2': [df1kd, df2kd, df2kd, df1kd, df2kd, df2kd],
        'pval': [p1k, p2k, p2k, p1k, p2k, p2k]
    }

    stats = pd.DataFrame(stats)

    # Calculate confidence intervals
    alpha = 0.05
    # Case 1 and 3
    f1l = f1k / f.ppf(1 - alpha / 2, df1, df1kd)
    f1u = f1k * f.ppf(1 - alpha / 2, df1kd, df1)
    l1 = (f1l - 1) / (f1l + (k - 1))
    u1 = (f1u - 1) / (f1u + (k - 1))
    f3l = f3k / f.ppf(1 - alpha / 2, df1, df2kd)
    f3u = f3k * f.ppf(1 - alpha / 2, df2kd, df1)
    l3 = (f3l - 1) / (f3l + (k - 1))
    u3 = (f3u - 1) / (f3u + (k - 1))
    # Case 2
    fj = msj / mse
    vn = df2kd * ((k * icc2 * fj + n * (1 + (k - 1) * icc2) - k * icc2))**2
    vd = df1 * k**2 * icc2**2 * fj**2 + \
        (n * (1 + (k - 1) * icc2) - k * icc2)**2
    v = vn / vd
    f2u = f.ppf(1 - alpha / 2, n - 1, v)
    f2l = f.ppf(1 - alpha / 2, v, n - 1)
    l2 = n * (msb - f2u * mse) / (f2u * (k * msj +
                                         (k * n - k - n) * mse) + n * msb)
    u2 = n * (f2l * msb - mse) / (k * msj + (k * n - k - n) * mse +
                                  n * f2l * msb)

    # Round the confidence intervals
    def list_round(x, decimals=2):
        for i, xi in enumerate(x):
            x[i] = np.round(xi, decimals).tolist()
        return x

    stats['CI95%'] = list_round([[l1, u1], [l2, u2], [l3, u3],
                                 [1 - 1 / f1l, 1 - 1 / f1u],
                                 [l2 * k / (1 + l2 * (k - 1)),
                                  u2 * k / (1 + u2 * (k - 1))],
                                 [1 - 1 / f3l, 1 - 1 / f3u]])

    return stats
def validate_etaSq(etaSq):
    # Qualitative interpretation of an eta-squared effect size
    if etaSq < .01:
        qual = 'Negligible'
    elif etaSq < .06:
        qual = 'Small'
    elif etaSq < .14:
        qual = 'Medium'
    else:
        qual = 'Large'
    return qual


dv = "Age"
between = "Survived_cate"
aov = pg.anova(dv=dv, between=between, data=df_train, detailed=True)
etaSq = aov.SS[0] / (aov.SS[0] + aov.SS[1])
print("Q1: Is Age related to Survived (numeric method)? A: continuous vs. categorical")
print("Eta squared (η2): %.3f, strength: %s" % (etaSq, validate_etaSq(etaSq)))
# print("Cramer's V result ", res.loc[2, 'results'], validate_etaSq(res.loc[2, 'results']))

dt = 'Survived_cate'
between = 'Sex'
# Step 1: build a contingency table to organize the two categorical variables
contTable = pd.crosstab(df_train[between], df_train[dt])
df = min(contTable.shape[0], contTable.shape[1]) - 1  # degrees of freedom
crosstab, res = researchpy.crosstab(df_train[between], df_train[dt],
                                    test='chi-square')
print()
print("Q2: Is Sex related to Survived (numeric method)? A: categorical vs. categorical")
print("Cramer's V: %.3f, result: %s" %
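# For reference, a sketch of getting eta-squared straight from pingouin
# (assumes a recent pingouin whose anova() accepts effsize='n2'; older
# releases only return partial eta-squared 'np2'):
aov_n2 = pg.anova(dv='Age', between='Survived_cate', data=df_train,
                  detailed=True, effsize='n2')
etaSq_pg = aov_n2.loc[0, 'n2']  # should match aov.SS[0] / (aov.SS[0] + aov.SS[1])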
# Bin column 7 (PageRank) into three levels
if df.iat[i, 7] < 0.042296667:
    df.iat[i, 7] = 1
elif df.iat[i, 7] < 0.0673333:
    df.iat[i, 7] = 2
else:
    df.iat[i, 7] = 3

df["PageR"].replace({1: "baja", 2: "media", 3: "alta"}, inplace=True)
#logX = np.log10(df['Mediana'])
#df = df.assign(mediana_log=df['Mediana'])
#df.drop(['Mediana'], axis=1, inplace=True)

factores = ["Grado", "CoefA", "CentCe", "CentCa", "Excent", "PageR"]
plt.figure(figsize=(8, 6))
for i in factores:
    # pg.anova expects column *names* for dv/between, not Series
    anova = pg.anova(dv="Flujomaximo", between=i, data=df, detailed=True)
    pg._export_table(anova, ("ANOVAs" + i + ".csv"))
    ax = sns.boxplot(x=df["Flujomaximo"], y=df[i], data=df,
                     palette="cubehelix")
    plt.savefig("boxflujo" + i + ".eps", bbox_inches='tight')
    tukey = pairwise_tukeyhsd(endog=df["Flujomaximo"], groups=df[i],
                              alpha=0.05)
    tukey.plot_simultaneous(xlabel='Flujo máximo', ylabel=i)
    plt.vlines(x=49.57, ymin=-0.5, ymax=4.5, color="red")
    plt.savefig("simultaneous_flujo" + i + ".eps", bbox_inches='tight')
    print(tukey.summary())
    with open("Tukeyflujo" + i + ".csv", 'w') as t_csv:
        writer = csv.writer(t_csv)
        writer.writerows(tukey.summary())
    plt.show()
pca_data = pd.DataFrame(data=comp, columns=['PC1', 'PC2'], index=X.index)
pca_data.index = data_cd56.index
# merge on both indices
pca_data = pd.merge(pca_data, data, left_index=True, right_index=True)

#%%
### STATISTICAL TESTS ###
# For these subplots we perform a one-way ANOVA to test the null hypothesis
# that two or more groups have the same population mean. All samples are
# independent here, as they come from different FCGR3A haplotypes and were
# tested in only one condition.
# !pip install openpyxl

ANOVA_top = anova(data=data_adcc,
                  dv='top',          # dependent variable
                  between='FCGR3A')  # between-subject factor
ANOVA_top.to_excel('../stats/ANOVA_top_figure3.xlsx')
ANOVA_top_posthoc = pairwise_tukey(data=data_adcc, dv='top',
                                   between='FCGR3A')
ANOVA_top_posthoc.to_excel('../stats/ANOVA_top_posthoc_figure3.xlsx')

ANOVA_ec50 = anova(data=data_adcc,
                   dv='EC50',         # dependent variable
                   between='FCGR3A')  # between-subject factor
ANOVA_ec50.to_excel('../stats/ANOVA_ec50_figure3.xlsx')
ANOVA_ec50_posthoc = pairwise_tukey(data=data_adcc, dv='EC50',
                                    between='FCGR3A')
file = 'E:/03_FORMATED_DATA/BEHAVIOR/Catwalk_Norm_Profiles_Cuff_Sham_Ctrl.xlsx'
palette = ['royalblue', '0.5', 'lightcoral']

fig, ax = plt.subplots(1, 4, sharex=False, sharey=True)
df = pd.read_excel(file, header=0)

# Easy first: peak amplitude at D15
postOp15 = df[['post_op_15', 'Condition']]
sn.boxplot(x='Condition', y='post_op_15', data=df, ax=ax[0],
           palette=['lightcoral', '0.5', 'royalblue'])
#sn.swarmplot(x='Condition', y='post_op_15', data=postOp15, ax=ax[0], color='black')

# [1] -------------------- Stats on PostOp15 peak --------------------
postOp15_KW = pg.kruskal(data=postOp15, dv='post_op_15', between='Condition')
postOp15_Anova = pg.anova(data=postOp15, dv='post_op_15', between='Condition')

print('Analysis of behavioral features')
print('[1]--------------------- Post Op 15 ------------------------')
print('Average values')
print(postOp15.groupby('Condition').mean())
print('')
print('STD')
print(postOp15.groupby('Condition').std())
print('')
print('Multi condition test')
print('Kruskal Wallis')
print(postOp15_KW)
# kruskal returns a one-row dataframe, so pull the scalar out before comparing
if postOp15_KW['p-unc'].values[0] <= 0.05:
    print('Post Hoc MWU tests')
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
# calculate the outlier cutoff
cut_off = iqr * 1.5
print(cut_off)
lower, upper = q25 - cut_off, q75 + cut_off
# identify outliers
outliers = [x for x in dataset['rssi'] if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in dataset['rssi'] if lower <= x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))

# Two-way ANOVA: if a factor's p-value exceeds the significance level, we
# fail to reject the null hypothesis for that factor
aov = pg.anova(dv='rssi', between=['Tag', 'Antenna'], data=dataset,
               detailed=True)
print(aov)

#number = LabelEncoder()
#dataset['Tag'] = number.fit_transform(dataset.Tag)
jobs_encoder = LabelBinarizer()
jobs_encoder.fit(dataset['Tag'])
transformed = jobs_encoder.transform(dataset['Tag'])
ohe_df = pd.DataFrame(transformed)
dataset = pd.concat([dataset, ohe_df], axis=1).drop(['Tag'], axis=1)

X = dataset.iloc[:, 4:].values
y = dataset.iloc[:, 3].values
#X = np.reshape(X, (202004, 1))
#y = np.reshape(y, (202004, 1))
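# Returning to the two-way ANOVA table computed above: a sketch of reading the
# interaction term programmatically. The 'Source' labels follow pingouin's
# 'Factor1 * Factor2' convention for two-way tables.
aov_idx = aov.set_index('Source')
interaction_p = aov_idx.loc['Tag * Antenna', 'p-unc']
if interaction_p < 0.05:
    print('Tag x Antenna interaction is significant (p=%.4g)' % interaction_p)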
def intraclass_corr(data=None, items=None, raters=None, scores=None, ci=.95):
    """Intra-class correlation coefficient.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe containing the variables
    items : string
        Name of column in data containing the items (targets).
    raters : string
        Name of column in data containing the raters (scorers).
    scores : string
        Name of column in data containing the scores (ratings).
    ci : float
        Confidence interval

    Returns
    -------
    icc : float
        Intraclass correlation coefficient
    ci : list
        Lower and upper confidence intervals

    Notes
    -----
    The intraclass correlation (ICC) assesses the reliability of ratings by
    comparing the variability of different ratings of the same subject to
    the total variation across all ratings and all subjects. The ratings
    are quantitative (e.g. Likert scale).

    Shrout and Fleiss (1979) describe six cases of reliability of ratings
    done by :math:`k` raters on :math:`n` targets. Pingouin only returns
    ICC1, which considers that each target is rated by a different rater
    and that the raters are selected at random. (This is a one-way ANOVA
    fixed effects model, computed as (MSB - MSW) / (MSB + (nr - 1) * MSW).)
    ICC1 is sensitive to differences in means between raters and is a
    measure of absolute agreement.

    This function has been tested against the ICC function of the R psych
    package.

    References
    ----------
    .. [1] Shrout, P. E., & Fleiss, J. L. (1979). Intraclass correlations:
           uses in assessing rater reliability. Psychological bulletin,
           86(2), 420.

    .. [2] https://cran.r-project.org/web/packages/psych/psych.pdf

    .. [3] http://www.real-statistics.com/reliability/intraclass-correlation/

    Examples
    --------
    ICC of wine quality assessed by 4 judges.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('icc')
    >>> pg.intraclass_corr(data=data, items='Wine', raters='Judge',
    ...                    scores='Scores', ci=.95)
    (0.727526, array([0.434, 0.927]))
    """
    from pingouin import anova
    from scipy.stats import f  # F-distribution for the CI (import was missing)

    # Check dataframe
    if any(v is None for v in [data, items, raters, scores]):
        raise ValueError('Data, items, raters and scores must be specified')
    assert isinstance(data, pd.DataFrame), 'Data must be a pandas dataframe.'
    # Check that scores is a numeric variable
    assert data[scores].dtype.kind in 'fi', 'Scores must be numeric.'
    # Check that data are fully balanced
    if data.groupby(raters)[scores].count().nunique() > 1:
        raise ValueError('Data must be balanced.')

    # Extract sizes
    k = data[raters].nunique()
    # n = data[groups].nunique()

    # ANOVA and ICC
    aov = anova(dv=scores, data=data, between=items, detailed=True)
    icc = (aov.at[0, 'MS'] - aov.at[1, 'MS']) / \
        (aov.at[0, 'MS'] + (k - 1) * aov.at[1, 'MS'])

    # Confidence interval
    alpha = 1 - ci
    df_num, df_den = aov.at[0, 'DF'], aov.at[1, 'DF']
    f_lower = aov.at[0, 'F'] / f.isf(alpha / 2, df_num, df_den)
    f_upper = aov.at[0, 'F'] * f.isf(alpha / 2, df_den, df_num)
    lower = (f_lower - 1) / (f_lower + k - 1)
    upper = (f_upper - 1) / (f_upper + k - 1)

    return round(icc, 6), np.round([lower, upper], 3)
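# Quick numeric sanity check of the ICC1 formula implemented above,
# ICC1 = (MSB - MSW) / (MSB + (k - 1) * MSW), using pingouin's bundled
# 'icc' dataset (a sketch; assumes pg.read_dataset is available):
import pingouin as pg

icc_data = pg.read_dataset('icc')
icc, icc_ci = intraclass_corr(data=icc_data, items='Wine', raters='Judge',
                              scores='Scores', ci=.95)
assert abs(icc - 0.727526) < 1e-5  # matches the doctest value above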
import pickle

import matplotlib.pyplot as plt
import pingouin as pg
import seaborn

pk1 = open('./stats_df_1.pkl', 'rb')
pk2 = open('./stats_df_2.pkl', 'rb')
df1 = pickle.load(pk1)
df2 = pickle.load(pk2)

# DF1
# PLOT
plt.subplot(121)
seaborn.stripplot(y=df1['RMSE'], x=df1['co_contraction_level'],
                  hue=df1['EMG_objective'])
# STATS
aov = pg.anova(dv='RMSE', between=['EMG_objective', 'co_contraction_level'],
               data=df1)
ptt = pg.pairwise_ttests(dv='RMSE',
                         between=['EMG_objective', 'co_contraction_level'],
                         data=df1, padjust='bonf')
pg.print_table(aov.round(3))
pg.print_table(ptt.round(3))

# DF2
# PLOT
plt.subplot(122)
seaborn.stripplot(y=df2['RMSE'], x=df2['Marker_noise_level_m'],
                  hue=df2['EMG_objective'])
# STATS -- this call was truncated in the source; the factors below are
# inferred from the DF1 pattern and the stripplot above
aov = pg.anova(dv='RMSE', between=['EMG_objective', 'Marker_noise_level_m'],
               data=df2)
y = bins % 2
dataset = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})
display(dataset)

# %%
dataset.info()

# %%
columns = dataset.columns.tolist()
target = ['y']
feature_names = [item for item in columns if item not in target]
rows = len(dataset)
print('All columns: ' + str(columns))
print('Feature names: ' + str(feature_names))
print('Target: ' + str(target))
print('Number of rows: ' + str(rows))

# %%
pg.anova(data=dataset, dv='x1', between='y', detailed=True)

# %%
pg.anova(data=dataset, dv='x2', between='y', detailed=True)

# %%
X = dataset[['x1']]
y = dataset[target]
rows = len(X)
min_samples_leaf = int(rows * 0.05)
classifier = DecisionTreeClassifier(criterion='gini',
                                    min_samples_leaf=min_samples_leaf)
classifier.fit(X, y)

# %%
X_columns = X.columns.tolist()
text_representation = export_text(classifier, feature_names=X_columns)
print(text_representation)

# %%
import pandas
import pingouin as pg
from pingouin import ttest, pairwise_ttests
from scipy.stats import mannwhitneyu, wilcoxon


def stats(model, quantity, data, targets, tw, rm, nd):
    if model == 'absolute':
        data = data.drop(['NormQuant'], axis=1)
        data['NormMean'] = data['NormMean'].astype(float)
        mean = 'NormMean'
    else:
        data = data.drop(['rq'], axis=1)
        data['rqMean'] = data['rqMean'].astype(float)
        mean = 'rqMean'

    # prepare data from intermediate dataframe
    data = data[data['Outliers'].eq(False)]
    data = data.drop_duplicates(keep='first')

    # t-test and anova for normally distributed data
    if nd == 'True':
        if quantity == 2:
            # T-Test between 2 groups
            stats_dfs = pandas.DataFrame()
            posthoc_dfs = pandas.DataFrame()
            group = data['Group'].dropna()
            group = group.drop_duplicates(keep='first').values.tolist()
            for item in targets:
                df = data[data['Target Name'].eq(item)]
                group1 = df[df['Group'].eq(group[0])][mean]
                group2 = df[df['Group'].eq(group[1])][mean]
                # note: bool(rm) was a bug -- bool('False') is True, so the
                # flag must be compared against the string explicitly
                t_test = ttest(group1, group2, paired=(rm == 'True'))
                if rm == 'True':
                    t_test['paired'] = 'TRUE'
                else:
                    t_test['paired'] = 'FALSE'
                t_test['Target Name'] = item
                if stats_dfs is None:
                    stats_dfs = t_test
                else:
                    stats_dfs = stats_dfs.append(t_test, ignore_index=True)
            # reformat output table
            stats_dfs = stats_dfs.rename(columns={
                'cohen-d': 'effect size',
                'BF10': 'Bayes factor',
                'dof': 'DF'
            })
            cols = [
                'Target Name', 'DF', 'T', 'tail', 'paired', 'p-val',
                'effect size', 'power', 'Bayes factor'
            ]
            stats_dfs = stats_dfs.reindex(columns=cols)
        elif quantity >= 3:
            # ANOVA test
            stats_dfs = pandas.DataFrame()
            posthoc_dfs = pandas.DataFrame()
            pvals = []
            for item in targets:
                if rm == 'True':
                    # one-way
                    if tw == 'False':
                        # repeated measure anova
                        aov = pg.rm_anova(
                            dv=mean,
                            data=data[data['Target Name'].eq(item)],
                            within='Group',
                            subject='Sample Name',
                            detailed=True)
                        pvals.append(aov['p-unc'][0])
                        aov = aov.drop([1])
                        aov['measures'] = ['dependent']
                        aov['Target Name'] = item
                    # two-way
                    else:
                        aov = pg.rm_anova(
                            dv=mean,
                            data=data[data['Target Name'].eq(item)],
                            within=['Group1', 'Group2'],
                            subject='Sample Name',
                            detailed=True)
                        reject_tw, pval_corr_tw = pg.multicomp(list(
                            aov['p-unc']), alpha=0.05, method='bonf')
                        aov['p-value corrected'] = pval_corr_tw
                        aov['measures'] = ['dependent'] * 3
                        aov['Target Name'] = [item] * 3
                        # drop() returns a copy; the assignment was missing in
                        # the original, which made this line a no-op
                        aov = aov.drop(['eps'], axis=1)
                    ph = pairwise_ttests(
                        data=data[data['Target Name'].eq(item)],
                        dv=mean,
                        within='Group',
                        subject='Sample Name',
                        padjust='fdr_bh')
                    ph['Target Name'] = item
                    ph['Test'] = 'T-Test'
                else:
                    # one-way
                    if tw == 'False':
                        aov = pg.anova(
                            dv=mean,
                            between='Group',
                            data=data[data['Target Name'].eq(item)],
                            detailed=True)
                        pvals.append(aov['p-unc'][0])
                        aov = aov.drop([1])
                        aov['measures'] = ['independent']
                        aov['Target Name'] = item
                        ph = pairwise_ttests(
                            data=data[data['Target Name'].eq(item)],
                            dv=mean,
                            between='Group',
                            padjust='fdr_bh')
                        ph['Test'] = 'T-Test'
                    # two-way
                    else:
                        aov = pg.anova(
                            dv=mean,
                            between=['Group1', 'Group2'],
                            data=data[data['Target Name'].eq(item)],
                            detailed=False)
                        aov = aov.drop([3])
                        reject_tw, pval_corr_tw = pg.multicomp(list(
                            aov['p-unc']), alpha=0.05, method='bonf')
                        aov['p-value corrected'] = pval_corr_tw
                        aov['measures'] = ['independent'] * 3
                        aov['Target Name'] = [item] * 3
                        ph = pairwise_ttests(
                            data=data[data['Target Name'].eq(item)],
                            dv=mean,
                            between=['Group1', 'Group2'],
                            padjust='fdr_bh')
                        ph['Test'] = 'T-Test'
                    ph['Target Name'] = item
                if stats_dfs is None:
                    stats_dfs = aov
                else:
                    stats_dfs = stats_dfs.append(aov, ignore_index=True)
                if posthoc_dfs is None:
                    posthoc_dfs = ph
                else:
                    posthoc_dfs = posthoc_dfs.append(ph, ignore_index=True)
            reject, pvals_corr = pg.multicomp(pvals, alpha=0.05,
                                              method='bonf')
            # reformat output tables
            stats_dfs = stats_dfs.rename(columns={
                'p-unc': 'p-value',
                'np2': 'effect size'
            })
            if tw == 'False':
                stats_dfs['p-value corrected'] = pvals_corr
                stats_dfs['distribution'] = ['parametric'] * len(targets)
                stats_dfs['test'] = ['ANOVA'] * len(targets)
                stats_dfs['statistic'] = ['NA'] * len(targets)
            else:
                stats_dfs['distribution'] = ['parametric'] * (len(targets) * 3)
                stats_dfs['test'] = ['ANOVA'] * (len(targets) * 3)
                stats_dfs['statistic'] = ['NA'] * (len(targets) * 3)
            cols = [
                'Target Name', 'Source', 'DF', 'F', 'MS', 'SS', 'p-value',
                'p-value corrected', 'measures', 'distribution', 'test',
                'statistic', 'effect size'
            ]
            stats_dfs = stats_dfs.reindex(columns=cols)
            if tw == 'False':
                posthoc_dfs = posthoc_dfs.drop(['Contrast', 'T'], axis=1)
            else:
                posthoc_dfs = posthoc_dfs.drop(['T'], axis=1)
            posthoc_dfs = posthoc_dfs.rename(
                columns={
                    'hedges': 'effect size',
                    'p-corr': 'p-value corrected',
                    'p-unc': 'p-value',
                    'p-adjust': 'correction method',
                    'BF10': 'Bayes factor',
                    'dof': 'DF'
                })
            if tw == 'False':
                cols2 = [
                    'Target Name', 'A', 'B', 'DF', 'p-value corrected',
                    'p-value', 'correction method', 'Paired', 'Parametric',
                    'Test', 'effect size', 'Bayes factor'
                ]
            else:
                cols2 = [
                    'Target Name', 'Contrast', 'Group1', 'A', 'B', 'DF',
                    'p-value corrected', 'p-value', 'correction method',
                    'Paired', 'Parametric', 'Test', 'effect size',
                    'Bayes factor'
                ]
            posthoc_dfs = posthoc_dfs.reindex(columns=cols2)

    # nonparametric tests for not normally distributed data
    else:
        if quantity == 2:
            stats_dfs = pandas.DataFrame()
            posthoc_dfs = pandas.DataFrame()
            group = data['Group'].dropna()
            group = group.drop_duplicates(keep='first').values.tolist()
            for item in targets:
                df = data[data['Target Name'].eq(item)]
                group1 = df[df['Group'].eq(group[0])][mean]
                group2 = df[df['Group'].eq(group[1])][mean]
                if rm == 'True':
                    # Wilcoxon signed-rank test for paired samples (the
                    # original had the paired/unpaired tests swapped)
                    test = wilcoxon(group1, group2)
                else:
                    # Mann-Whitney U test for independent samples
                    test = mannwhitneyu(group1, group2)
                test = pandas.DataFrame(
                    {
                        'Target Name': item,
                        'pvalue': test.pvalue,
                        'statistic': test.statistic
                    },
                    index=[0])
                if stats_dfs is None:
                    stats_dfs = test
                else:
                    stats_dfs = stats_dfs.append(test, ignore_index=True)
        elif quantity >= 3:
            stats_dfs = pandas.DataFrame()
            posthoc_dfs = pandas.DataFrame()
            pvals = []
            for item in targets:
                if rm == 'True':
                    # friedman test for repeated measurements
                    df = pg.friedman(dv=mean,
                                     within='Group',
                                     subject='Sample Name',
                                     data=data[data['Target Name'].eq(item)])
                    pvals.append(df['p-unc'][0])
                    df['test'] = ['Friedman Q']
                    df['measures'] = ['dependent']
                    df = df.rename(columns={'Q': 'statistic'})
                    df['Target Name'] = item
                    df['DF'] = 'NA'
                    ph = pairwise_ttests(
                        data=data[data['Target Name'].eq(item)],
                        dv=mean,
                        within='Group',
                        subject='Sample Name',
                        padjust='fdr_bh',
                        parametric=False)
                    ph['Target Name'] = item
                    ph['DF'] = 'NA'
                    ph['Bayes factor'] = 'NA'
                    ph['Test'] = 'Wilcoxon'
                else:
                    # Kruskal-Wallis H test
                    df = pg.kruskal(dv=mean,
                                    between='Group',
                                    data=data[data['Target Name'].eq(item)])
                    pvals.append(df['p-unc'][0])
                    df['test'] = ['Kruskal-Wallis H']
                    df['measures'] = ['independent']
                    df = df.rename(columns={'H': 'statistic'})
                    df['Target Name'] = item
                    df['DF'] = 'NA'
                    ph = pairwise_ttests(
                        data=data[data['Target Name'].eq(item)],
                        dv=mean,
                        between='Group',
                        padjust='fdr_bh',
                        parametric=False)
                    ph['Target Name'] = item
                    ph['DF'] = 'NA'
                    ph['Bayes factor'] = 'NA'
                    ph['Test'] = 'Mann-Whitney U'
                if stats_dfs is None:
                    stats_dfs = df
                else:
                    stats_dfs = stats_dfs.append(df, ignore_index=True)
                if posthoc_dfs is None:
                    posthoc_dfs = ph
                else:
                    posthoc_dfs = posthoc_dfs.append(ph, ignore_index=True)
            reject, pvals_corr = pg.multicomp(pvals, alpha=0.05,
                                              method='bonf')
            # reformat output tables
            stats_dfs = stats_dfs.rename(columns={
                'dof': 'DF',
                'p-unc': 'p-value'
            })
            stats_dfs['p-value corrected'] = pvals_corr
            stats_dfs['distribution'] = ['non-parametric'] * len(targets)
            stats_dfs['MS'] = ['NA'] * len(targets)
            stats_dfs['SS'] = ['NA'] * len(targets)
            stats_dfs['effect size'] = ['NA'] * len(targets)
            cols = [
                'Target Name', 'DF', 'MS', 'SS', 'p-value',
                'p-value corrected', 'measures', 'distribution', 'test',
                'statistic', 'effect size'
            ]
            stats_dfs = stats_dfs.reindex(columns=cols)
            posthoc_dfs = posthoc_dfs.drop(['Contrast'], axis=1)
            posthoc_dfs = posthoc_dfs.rename(
                columns={
                    'hedges': 'effect size',
                    'p-corr': 'p-value corrected',
                    'p-unc': 'p-value',
                    'p-adjust': 'correction method',
                    'BF10': 'Bayes factor'
                })
            cols2 = [
                'Target Name', 'A', 'B', 'DF', 'p-value corrected',
                'p-value', 'correction method', 'Paired', 'Parametric',
                'Test', 'effect size', 'Bayes factor'
            ]
            posthoc_dfs = posthoc_dfs.reindex(columns=cols2)

    return stats_dfs, posthoc_dfs
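# A hypothetical call of stats(), assuming an intermediate dataframe with the
# columns the function expects ('Group', 'Target Name', 'Sample Name',
# 'Outliers', and the chosen mean column); all values are placeholders.
stats_table, posthoc_table = stats(model='absolute', quantity=3,
                                   data=results_df, targets=['GAPDH', 'ACTB'],
                                   tw='False', rm='False', nd='True')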
def test_pandas(self):
    """Test pandas method.
    """
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))

    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))

    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df))

    # FDR-corrected post hocs with Hedges'g effect size
    ttests = df.pairwise_ttests(dv='Scores', within='Time',
                                subject='Subject', padjust='fdr_bh',
                                effsize='hedges')
    assert ttests.equals(
        pg.pairwise_ttests(dv='Scores', within='Time', subject='Subject',
                           padjust='fdr_bh', effsize='hedges', data=df))

    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df))

    # Test pairwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].values,
                                  [1, 0.392, 0.06, -0.014, -0.149])
    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert round(corrs.loc['X', 'Y'], 3) == corrs2.loc['pearson', 'r']

    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].values,
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
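# For context, these DataFrame methods exist because importing pingouin
# registers accessors such as .anova() on pandas DataFrames. A minimal sketch
# using the bundled dataset the test suite relies on:
import pingouin as pg

demo = pg.read_dataset('mixed_anova')  # columns: Scores, Time, Group, Subject
aov_method = demo.anova(dv='Scores', between='Group', detailed=True)
aov_func = pg.anova(dv='Scores', between='Group', detailed=True, data=demo)
assert aov_method.equals(aov_func)  # method and functional syntax agree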
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels
import pingouin as pg  # needed for pg.rm_anova below (was missing)
from pingouin import mixed_anova, anova, pairwise_tukey
from pingouin import logistic_regression

data_merged = pd.read_csv(
    r'C:\Users\user\Desktop\FOCUS\behavioral\P_Merged_var.csv')

### Fill in NaN values in false alarm and omission error (0)
data_merged = data_merged.fillna({'false_alarm': 0, 'om_err': 0})
#data_merged.to_csv(r'C:\Users\user\Desktop\FOCUS\behavioral\P_Merged_var.csv', index=None, header=True)

# ANOVA - does correct reaction time differ between blocks?
aov_corr_rt = anova(dv='corr_rt', between='blocks', data=data_merged)
print(aov_corr_rt)

rep_anov_alarm = pg.rm_anova(data=data_merged,
                             dv='false_alarm',
                             within='blocks',
                             subject='participant',
                             detailed=True)

# follow-up pairwise comparison
pairs_corr_rt = pairwise_tukey(dv='corr_rt', between='blocks',
                               data=data_merged)
print(pairs_corr_rt)
data = pd.DataFrame(data=dict_dat)

### Store data as csv
data.to_csv(r'data.csv')

### Plot data
sns.kdeplot(data=one)
sns.kdeplot(data=two)
sns.kdeplot(data=three)
sns.kdeplot(data=np.append(one, [two, three]))

### ANOVA
res = pg.anova(data=data, dv='data', between='group', ss_type=2,
               detailed=True, effsize='np2')

m_1, var_1, sd_1 = np.mean(one), np.var(one), np.std(one)
m_2, var_2, sd_2 = np.mean(two), np.var(two), np.std(two)
m_3, var_3, sd_3 = np.mean(three), np.var(three), np.std(three)
m_tot, var_tot = data['data'].mean(), data['data'].var()
n_obs = len(one)

# total sum of squares and the (biased) variance of the dependent variable
sq_tot = ((dict_dat['data'] - dict_dat['data'].mean())**2).sum()
var_dep = sq_tot / (len(dict_dat['data']))

# between-group sum of squares (35 = group size) and within-group (residual)
# sum of squares; np.var defaults to ddof=0, so each group's sum of squares
# is n_obs * var (the original multiplied by n_obs - 1)
sqa = 35 * np.sum((m_tot - m_1)**2) + 35 * np.sum(
    (m_tot - m_2)**2) + 35 * np.sum((m_tot - m_3)**2)
sqr = n_obs * var_1 + n_obs * var_2 + n_obs * var_3
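# To close the loop on the hand computation: the F ratio follows from the two
# sums of squares (a sketch assuming three equally sized groups, with the
# group size 35 hard-coded in sqa above equal to n_obs).
from scipy.stats import f as f_dist

df_between = 3 - 1           # k - 1 groups
df_within = 3 * n_obs - 3    # N - k observations
F = (sqa / df_between) / (sqr / df_within)
p_value = f_dist.sf(F, df_between, df_within)
# F and p_value should agree with the 'F' and 'p-unc' columns of `res` above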
for var_ in vars_time:
    print(f'Normality test (Shapiro), {var_}')
    print(pg.normality(test_df, dv=var_, group='Group'))

# Within group rm ANOVA to determine side preference
# Prep df with only Sample and Test phases
rm_df = si_raw[si_raw['Phase'] != 'Habituation']
rm_df = pd.melt(
    rm_df,
    id_vars=['Subject', 'Group', 'Phase'],
    value_vars=['Time Object/New Cons Chamber', 'Time Conspecific Chamber'],
    var_name='Side',
    value_name='Time')

# Run ANOVA
samp_anova = pg.anova(data=rm_df[rm_df['Phase'] == 'Sample'],
                      dv='Time',
                      between=['Side', 'Group'])
# Save to csv
samp_anova.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/Stats/sample_time_anova.csv')

# post hoc test, pairwise_ttests, Holm-Bonferroni correction
samp_posthoc = pg.pairwise_ttests(data=rm_df[rm_df['Phase'] == 'Sample'],
                                  dv='Time',
                                  between=['Group', 'Side'],
                                  padjust='holm')
samp_posthoc.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/Stats/sample_time_posthoc.csv')
samp_posthoc[['Group', 'p-corr']]

ph_val = zip(['***', '***', '***', '*'], [6, 55, 102, 155],
             [120, 100, 120, 120])
for ii, jj, kk in ph_val:
from statsmodels.formula.api import ols
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pingouin as pg
import researchpy as rp
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd

df = pd.read_csv("Datos.csv",
                 index_col=None,
                 usecols=[1, 2, 3, 4, 8],
                 dtype={'generador': 'category',
                        'algoritmo_flujo': 'category',
                        'vertices': 'category',
                        'aristas': 'category',
                        'mediana': np.float64})

logX = np.log1p(df['mediana'])
df = df.assign(mediana_log=logX.values)
df.drop(['mediana'], axis=1, inplace=True)

factores = ["vertices", "generador", "aristas", "algoritmo_flujo"]
plt.figure(figsize=(8, 6))
for i in factores:
    print(rp.summary_cont(df['mediana_log'].groupby(df[i])))
    anova = pg.anova(dv='mediana_log', between=i, data=df, detailed=True)
    pg._export_table(anova, ("ANOVA" + i + ".csv"))
    ax = sns.boxplot(x=df["mediana_log"], y=df[i], data=df, palette="Set1")
    plt.savefig("boxplot_" + i + ".png", bbox_inches='tight')
    plt.savefig("boxplot_" + i + ".eps", bbox_inches='tight')
    tukey = pairwise_tukeyhsd(endog=df["mediana_log"], groups=df[i],
                              alpha=0.05)
    tukey.plot_simultaneous(xlabel='Time', ylabel=i)
    plt.vlines(x=49.57, ymin=-0.5, ymax=4.5, color="red")
    plt.savefig("simultaneous_tukey" + i + ".png", bbox_inches='tight')
    plt.savefig("simultaneous_tukey" + i + ".eps", bbox_inches='tight')
    print(tukey.summary())
    with open("Tukey" + i + ".csv", 'w') as t_csv:
        writer = csv.writer(t_csv)
        writer.writerows(tukey.summary())
    plt.show()
# Q1. Does the condition affect the marker value?
# One-way ANOVA (value ~ protocol) followed by post hoc Tukey tests
marker_test_dir = save_test_dir / "01_markers"
if not os.path.exists(marker_test_dir):
    os.mkdir(marker_test_dir)

marker_ph_dict = {}
for marker_label, marker_df in marker_dict.items():
    print(marker_label)
    # run anova
    curr_anova_marker = pg.anova(dv=dep_var, between=condition_col,
                                 data=marker_df)
    pg.print_table(curr_anova_marker)
    curr_ph_marker = pg.pairwise_tukey(dv=dep_var, between=condition_col,
                                       data=marker_df)
    pg.print_table(curr_ph_marker)
    marker_ph_dict[marker_label] = curr_ph_marker
    # save the files
    label_test_dir = marker_test_dir / marker_label
    if not os.path.exists(label_test_dir):
for year, df in metrics.groupby(['Year']):
    for level, data in df.groupby('type_cat'):
        pubs_list[f'{year}_{level}'] = list(data['pubs_awarded'])
        fwci_list[f'{year}_{level}'] = list(data['fwci_awarded'])

# Generate separate dataframes
pubs = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in pubs_list.items()]))
fwci = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in fwci_list.items()]))

FileHandling.df_to_excel(data_frames=[pubs, fwci],
                         sheetnames=['pubs', 'fwci'],
                         output_path=f'{output_folder}metrics_per_year.xlsx')

# Collect cols for each level
levels = ['_1', '_2', '_3']
for level in levels:
    cols = [col for col in pubs.columns.tolist() if level in col]
    test_df = pubs[cols].melt(value_name='publications', var_name='Group')

    # One-way ANOVA across the year/level groups
    aov = pg.anova(data=test_df, dv='publications', between='Group',
                   export_filename=f'{output_folder}anova_pubs{level}.csv')
    pg.print_table(aov)

    # Bonferroni-corrected post hocs
    posthoc = pg.pairwise_ttests(data=test_df, dv='publications',
                                 between='Group', within=None,
                                 parametric=True, alpha=.05,
                                 tail='two-sided', padjust='bonf',
                                 effsize='none', return_desc=False,
                                 export_filename=f'{output_folder}bonf_pubs{level}.csv')
    # Pretty printing of table
    pg.print_table(posthoc, floatfmt='.3f')
print(df["Excentricidad"]) logX = np.log1p(df['Mediana']) df = df.assign(mediana_log=logX.values) df.drop(['Mediana'], axis=1, inplace=True) factores = [ "Grado", "CoefAg", "CentCer", "CentCag", "Excentricidad", "PageRag" ] plt.figure(figsize=(8, 6)) for i in factores: print(rp.summary_cont(df['FlujoMax'].groupby(df[i]))) anova = pg.anova( dv='FlujoMax', between=i, data=df, detailed=True, ) pg._export_table(anova, ("ANOVAsFlujoMax" + i + ".csv")) ax = sns.boxplot(x=df["FlujoMax"], y=df[i], data=df, palette="cubehelix") plt.savefig("boxplot_FlujoMax" + i + ".eps", bbox_inches='tight') tukey = pairwise_tukeyhsd(endog=df["FlujoMax"], groups=df[i], alpha=0.05) tukey.plot_simultaneous(xlabel='Flujo Maximo', ylabel=i) plt.savefig("simultaneous_tukey" + i + ".eps", bbox_inches='tight') print(tukey.summary()) t_csv = open("TukeyFlujoMax" + i + ".csv", 'w')
def intraclass_corr(data=None, groups=None, raters=None, scores=None, ci=.95):
    """Intra-class correlation coefficient.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe containing the variables
    groups : string
        Name of column in data containing the groups.
    raters : string
        Name of column in data containing the raters (scorers).
    scores : string
        Name of column in data containing the scores (ratings).
    ci : float
        Confidence interval

    Returns
    -------
    icc : float
        Intraclass correlation coefficient
    ci : list
        Lower and upper confidence intervals

    Notes
    -----
    The intraclass correlation (ICC) assesses the reliability of ratings by
    comparing the variability of different ratings of the same subject to
    the total variation across all ratings and all subjects. The ratings
    are quantitative (e.g. Likert scale).

    Translated in Python from:
    http://www.real-statistics.com/reliability/intraclass-correlation/

    Examples
    --------
    1. ICC of wine quality assessed by 4 judges.

        >>> from pingouin.datasets import read_dataset
        >>> from pingouin import intraclass_corr
        >>> data = read_dataset('icc')
        >>> intraclass_corr(data, 'Wine', 'Judge', 'Scores')
        (0.727525596259691, array([0.434, 0.927]))
    """
    from pingouin import anova
    from scipy.stats import f

    # Check dataframe
    if any(v is None for v in [data, groups, raters, scores]):
        raise ValueError('Data, groups, raters and scores must be specified')
    if not isinstance(data, pd.DataFrame):
        raise ValueError('Data must be a pandas dataframe.')
    # Check that scores is a numeric variable
    if data[scores].dtype.kind not in 'fi':
        raise ValueError('Scores must be numeric.')
    # Check that data are fully balanced
    if data.groupby(raters)[scores].count().unique().size > 1:
        raise ValueError('Data must be balanced.')

    # Extract sizes
    k = data[raters].unique().size
    # n = data[groups].unique().size

    # ANOVA and ICC
    aov = anova(dv=scores, data=data, between=groups, detailed=True)
    icc = (aov.loc[0, 'MS'] - aov.loc[1, 'MS']) / \
        (aov.loc[0, 'MS'] + (k - 1) * aov.loc[1, 'MS'])

    # Confidence interval
    alpha = 1 - ci
    df_num, df_den = aov.loc[0, 'DF'], aov.loc[1, 'DF']
    f_lower = aov.loc[0, 'F'] / f.isf(alpha / 2, df_num, df_den)
    f_upper = aov.loc[0, 'F'] * f.isf(alpha / 2, df_den, df_num)
    lower = (f_lower - 1) / (f_lower + k - 1)
    upper = (f_upper - 1) / (f_upper + k - 1)

    return icc, np.round([lower, upper], 3)
# Statistical analysis on excitations with an ANOVA method
df_stats = pd.DataFrame({
    "RMSE": RMSEtrack,
    "co_contraction_level": co_lvl_df,
    "EMG_objective": EMG_n_lvl_df,
    "Marker_noise_level_m": marker_n_lvl_df,
    "component": states_controls_df,
    "weight_level": weight_lvl_df,
})
df_stats = df_stats[df_stats["component"] == "force"]
df_stats = df_stats[df_stats["weight_level"] == "high"]
df_stats = df_stats[df_stats["RMSE"].notna()]
df_stats.to_pickle("stats_df_1.pkl")

aov = pg.anova(dv="RMSE",
               between=["EMG_objective", "co_contraction_level"],
               data=df_stats)
ptt = pg.pairwise_ttests(
    dv="RMSE",
    between=[
        "co_contraction_level",
        "EMG_objective",
    ],
    data=df_stats,
    padjust="bonf",
)
pg.print_table(aov.round(3))
pg.print_table(ptt.round(3))

# Figure of RMSE on force as a function of co-contraction level (Fig. 7)
import matplotlib
== "Omnibus test of normality" else "shapiro") if y_var2 == "None": st.write(normality) if y_var2 == "None": st.success("Levene test for homoscedasticity of variances") homoscedasticity = pg.homoscedasticity(df, dv=x_var, group=y_var) if y_var2 == "None": st.write(homoscedasticity) if classic_vs_welch == "Classic ANOVA": if y_var2 == "None": anova = pg.anova(dv=x_var, between=y_var, data=df, detailed=True) st.success("Classic ANOVA results") else: anova = df.anova(dv=x_var, between=[y_var, y_var2]) st.success("Two-way ANOVA results") st.write(anova.round(3)) if y_var2 == "None": st.success("Tukey HSD multiple comparisons") tukey_mult = df.pairwise_tukey(dv=x_var, between=y_var).round(3) st.write(tukey_mult) if y_var2 != "None": st.success("Tukey HSD multiple comparisons")
Q_data_raw = pd.read_csv(r'C:\Users\de_hauk\PowerFolders\EXPRA_Peper_SS_2020\Daten\fragebogen\fb_exp.txt',
                         sep='\t').sort_values('ID').set_index('ID').drop(labels='group', axis=1)

#BSI_dat_fin = pd.concat([BSI_dat, Q_data_raw], axis=0)
BSI_dat_fin = pd.merge(BSI_dat, Q_data_raw, left_index=True,
                       right_index=True)

#### Save Data
# BSI
BSI_dat_fin.to_csv(r'C:\Users\de_hauk\PowerFolders\EXPRA_Peper_SS_2020\Daten\data_JASP_2_BSI.csv', sep=',')

# trial_type / two-factor ANOVA data
anova_2f_bs = AAA_dat.copy()[['id', 'RT_M_bl', 'trial_type', 'group']]
anova_2f_bs.to_csv(r'C:\Users\de_hauk\PowerFolders\EXPRA_Peper_SS_2020\Daten\2f_anova\anova_2f_bs.csv', sep=',')

aov_2F = pg.anova(dv='BSI_RT_BL', between='group', data=BSI_dat,
                  detailed=True)

#### Plots
sns_plot_BSI = sns.catplot(x="group", y="BSI_RT_BL", data=BSI_dat,
                           kind='box')
sns_plot_BSI.savefig(r"C:\Users\de_hauk\PowerFolders\EXPRA_Peper_SS_2020\Daten\2f_anova\1F_dat_BSI.png")
plt.clf()

sns_plot_tt = sns.catplot(x='trial_type', y="RT_M_bl", data=AAA_dat,
                          hue="group", kind='box')
sns_plot_tt.savefig(r"C:\Users\de_hauk\PowerFolders\EXPRA_Peper_SS_2020\Daten\2f_anova\2F_dat_tt.png")

anova_2F1 = pg.anova(dv='RT_M_bl', between=['group', 'trial_type'],
                     data=AAA_dat, detailed=True)
anova_2F1_ss_tot = ((AAA_dat['RT_M_bl'] - AAA_dat['RT_M_bl'].mean())**2).sum()
# partial eta squared for 'group', from the SS values in the table above
anova_2F1_np_2_group = 2365 / (2365 + 308246)
def test_pandas(self):
    """Test pandas method.
    """
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))
    aov3_ss1 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=1)
    aov3_ss2 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=2)
    aov3_ss2_pg = pg.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                           data=df_aov3, ss_type=2)
    assert not aov3_ss1.equals(aov3_ss2)
    assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))

    # Test the ANCOVA
    aov = df_anc.ancova(dv='Scores', covar='Income',
                        between='Method').round(3)
    assert (aov.equals(
        pg.ancova(data=df_anc, dv='Scores', covar='Income',
                  between='Method').round(3)))

    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert (aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df)))

    # FDR-corrected post hocs with Hedges'g effect size
    ttests = df.pairwise_tests(dv='Scores', within='Time',
                               subject='Subject', padjust='fdr_bh',
                               effsize='hedges')
    assert (ttests.equals(
        pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                          padjust='fdr_bh', effsize='hedges', data=df)))

    # Pairwise Tukey
    tukey = df.pairwise_tukey(dv='Scores', between='Group')
    assert tukey.equals(
        pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert (aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df)))

    # Test pairwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.iloc[:, :5].pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                  [1, 0.392, 0.06, -0.014, -0.149])
    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

    # Test rcorr (correlation matrix with p-values)
    # We compare against Pingouin pairwise_corr function
    corrs = df_corr.rcorr(padjust='holm', decimals=4)
    corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
    assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
    assert (corrs.at['Agreeableness', 'Neuroticism'] ==
            str(corrs2.at[2, 'r']))
    corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
    assert (corrs.at['Neuroticism', 'Agreeableness'] ==
            str(corrs2.at[2, 'p-corr'].round(4)))
    corrs = df_corr.rcorr(upper='n', decimals=5)
    corrs2 = df_corr.pairwise_corr().round(5)
    assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
    assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
    # Method = spearman does not work with Python 3.5 on Travis?
    # Instead it seems to return the Pearson correlation!
    df_corr.rcorr(method='spearman')
    df_corr.rcorr()

    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
import numpy as np
import pandas as pd
import scipy.io as sio
from pingouin import anova, kruskal

allFstat = np.empty((npatients))  # preallocation was missing in the original
ANOVApvals = np.empty((npatients))
allHstat = np.empty((npatients))
KWpvals = np.empty((npatients))

for p in range(npatients):
    print(f'Loading summary delta data for Patient {p+1}')
    summarydelta = sio.loadmat(f'data/Patient{p+1}_summarydelta')
    stackeddelta = np.hstack((np.squeeze(summarydelta['state1delta']),
                              np.squeeze(summarydelta['state2delta'])))
    whichstate = np.ones(stackeddelta.shape[0]) * 2
    whichstate[0:(np.squeeze(summarydelta['state1delta']).size)] = 1
    delta_df = pd.DataFrame({
        'standarddelta': stackeddelta,
        'brainstate': whichstate
    })
    aov = anova(dv='standarddelta', between='brainstate', data=delta_df,
                detailed=True)
    allFstat[p] = aov['F'].iloc[0]
    ANOVApvals[p] = aov['p-unc'].iloc[0]
    kw = kruskal(dv='standarddelta', between='brainstate', data=delta_df,
                 detailed=True)
    allHstat[p] = kw['H'].iloc[0]
    KWpvals[p] = kw['p-unc'].iloc[0]

nANOVAsig001 = np.sum(ANOVApvals < .001)
print(f'There were {nANOVAsig001} significant differences by ANOVA '
      f'(alpha = .001) of {npatients} patients between mean standardized '
      f'delta across both brain-derived states')
nANOVAsig01 = np.sum(ANOVApvals < .01)
aov_table

# %% [markdown]
# The output from this command provides us with two things. First, it shows us the result of a t-test for each of the dummy variables, which basically tells us whether each of the conditions separately differs from placebo; it appears that Drug 1 does whereas Drug 2 does not. However, keep in mind that if we wanted to interpret these tests, we would need to correct the p-values to account for the fact that we have done multiple hypothesis tests; we will see an example of how to do this in the next chapter.
#
# Remember that the hypothesis that we started out wanting to test was whether there was any difference between any of the conditions; we refer to this as an *omnibus* hypothesis test, and it is the test that is provided by the F statistic. The F statistic basically tells us whether our model is better than a simple model that just includes an intercept. In this case we see that the F test is highly significant, consistent with our impression that there did seem to be differences between the groups (which in fact we know there were, because we created the data).

# %%
ols_model = ols(formula='BPsys ~ group', data=df)
ols_result = ols_model.fit()
aov_table = sm.stats.anova_lm(ols_result)
aov_table

# %%
import pingouin as pg
pg.anova(data=df, dv='BPsys', between='group', effsize="np2")

# %%
pg.pairwise_tukey(data=df, dv='BPsys', between='group')

# %% [markdown]
# ## Learning objectives
#
# After reading this chapter, you should be able to:
#
# * Describe the rationale behind the sign test
# * Describe how the t-test can be used to compare a single mean to a hypothesized value
# * Compare the means for two paired or unpaired groups using a two-sample t-test
#
#
# ## Appendix