def _varAnalysis(df, labels):
    """Run a per-feature group-difference test over the columns of *df*.

    For exactly two groups, a Levene test picks between Student's and
    Welch's t-test per feature; for more than two groups it picks between
    a classic one-way ANOVA and a Welch ANOVA (via pingouin).

    Arguments:
        df : pd.DataFrame of shape (n_samples, n_features)
        labels : one group label per sample (length == n_samples)

    Returns:
        (F, P) : two single-column DataFrames indexed by ``df.columns``
        holding, per feature, the test statistic and the p-value.

    Raises:
        ValueError : if ``len(labels)`` does not match the number of rows,
        or fewer than two distinct groups are present.
    """
    from scipy.stats import levene

    if df.shape[0] != len(labels):
        # Fixed: the original had an unreachable `return 0` after this raise.
        raise ValueError(
            "The number of input samples is not equal to labels size")
    label_ = np.unique(labels)
    # _split is expected to yield one array-like per group; each item is
    # indexed as item[:, i] below, so it must be 2-D (samples x features).
    groups = _split(df, labels)
    F, P = [], []
    if len(label_) == 2:
        print('Performing t-test analysis...')
        from scipy.stats import ttest_ind
        for i in range(df.shape[1]):
            sample = [item[:, i] for item in groups]
            # Levene decides whether equal variances may be assumed;
            # p < 0.05 means heteroscedastic -> Welch's t-test.
            stat, p = levene(*sample)
            f, p = ttest_ind(*sample, equal_var=(p >= 0.05))
            F.append(f)
            P.append(p)
    elif len(label_) > 2:
        print('Performing anova analysis...')
        for i in range(df.shape[1]):
            sample = [item[:, i] for item in groups]
            stat, p = levene(*sample)
            if p < 0.05:
                # Heteroscedastic: Welch ANOVA needs a long-format frame.
                from pingouin import welch_anova
                meta = pd.DataFrame(df.iloc[:, i])
                meta.columns = ['feature']
                meta['labels'] = labels
                result = welch_anova(data=meta, dv='feature',
                                     between='labels')
                f = result['F'].values[0]
                p = result['p-unc'].values[0]
            else:
                from scipy.stats import f_oneway
                f, p = f_oneway(*sample)
            F.append(f)
            P.append(p)
    else:
        raise ValueError("Groups for comparison are less than 2!")
    F = pd.DataFrame(F)
    P = pd.DataFrame(P)
    F.index = df.columns
    P.index = df.columns
    return F, P
def corr_to_target( df : pd.DataFrame, target : Any, cat_features : Any, ) -> pd.DataFrame: """ Determine correlation of target feature to all other features with an effect size that is comparable between categorical and numerical features. Arguments: df : Data target : Target feature key in DataFrame with respect to which correlations are determined cat_features : Keys for categorical features in DataFrame Returns: DataFrame with correlations measures """ result = defaultdict(dict) cat_target = target in cat_features num_target = not cat_target for f in df.columns.drop(target): data = df[[f, target]].dropna() cat_feature = f in cat_features num_feature = not cat_feature result['Categorical'][f] = int(cat_feature) if cat_target and cat_feature: pass elif cat_target and num_feature: pass elif num_target and cat_feature: vc = data[f].value_counts() if vc.min() > 1: n2, pval = pg.welch_anova(data=data, dv=target, between=f).loc[0,['np2','p-unc']].values result['R2'][f] = n2 result['R2_adj'][f] = r2_adjusted(n2,len(data),len(vc)) result['pval'][f] = pval #mi = mutual_info_classif(data[[target]], data[f])[0] #result['MI'][f] = mi elif num_target and num_feature: r, pval = stats.pearsonr(data[f], data[target]) result['R2'][f] = r**2 result['R2_adj'][f] = r2_adjusted(r**2, len(data), 1) result['pval'][f] = pval #mi = mutual_info_regression(data[[f]], data[target])[0] #result['MI'][f] = mi return pd.DataFrame(result)
def welch_f_test(self, stacked_df, dependentvar, groupname):
    '''Welch F test (one-way Welch ANOVA) at alpha = 0.05.

    input:
        stacked_df: a dataframe from table_transform_function
        dependentvar: column name of the value to compare
        groupname: column name of the groups to split by
    return:
        'reject Null Hypothesis' if p <= 0.05 (a group difference is
        present), otherwise 'fail to reject Null Hypothesis'.
    '''
    # alpha is 0.05
    # Fixed: pass arguments by keyword. pingouin.welch_anova's signature
    # is (data=None, dv=None, between=None), so the previous positional
    # call (dependentvar, groupname, stacked_df) bound the DataFrame to
    # `between` and fails on current pingouin releases.
    p_value = pingouin.welch_anova(
        data=stacked_df, dv=dependentvar, between=groupname)['p-unc'][0]
    if p_value > 0.05:
        return 'fail to reject Null Hypothesis'
    return 'reject Null Hypothesis'
# Outlier screen on open-arm time; detec_outlier returns the cleaned frame
# plus the rows it flagged (computed within each Subject Group).
epms1_clean, open_out = detec_outlier(df=epms1_clean, var_name='Time in Zone (%) - Open Arms', var_group='Subject Group')
open_out
# NOTE(review): this filters the raw `epm_s1` frame while the rest of the
# section operates on `epms1_clean` — confirm the mismatch is intentional.
epm_s1[epm_s1['Entries in Zone - Center'].between(0, 5)]
# Stats Session 1
# Entries is normally distributed, anova
entries_anova = pg.anova(data=epms1_clean, dv='Entries in Zone - Center', between='Subject Group')
entries_anova.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/cross_center_anova.csv'
)
# Total distance is not normal and fail Levenne; Welch anova
dist_Welch = pg.welch_anova(data=epms1_clean, dv='Total Distance', between='Subject Group')
dist_Welch
# Games-Howell post hoc (no equal-variance assumption), matching Welch ANOVA.
tdist_ph = pg.pairwise_gameshowell(data=epms1_clean, dv='Total Distance', between='Subject Group')
dist_Welch.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/total_dist_Welch.csv'
)
tdist_ph.to_csv(
    '/Users/labc02/Documents/PDCB_data/Behavior/EPM/Stats/total_dist_ph.csv')
# Homoscedasticity check (Levene by default) before the open-arm ANOVA.
pg.homoscedasticity(data=epms1_clean, dv='Time in Zone (%) - Open Arms', group='Subject Group')
# Open Arms ANOVA
def test_pandas(self):
    """Test pandas method.

    Smoke-tests Pingouin's pandas accessors: each DataFrame method must
    return exactly what the corresponding ``pg.*`` function returns on
    the module-level fixtures (``df``, ``df_aov3``, ``df_anc``, ``data``,
    ``df_corr``).
    """
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))
    # Two-way ANOVA: SS type 1 and type 2 must differ on unbalanced data.
    aov3_ss1 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=1)
    aov3_ss2 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=2)
    aov3_ss2_pg = pg.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                           data=df_aov3, ss_type=2)
    assert not aov3_ss1.equals(aov3_ss2)
    assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))
    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))
    # Test the ANCOVA
    aov = df_anc.ancova(dv='Scores', covar='Income',
                        between='Method').round(3)
    assert (aov.equals(
        pg.ancova(data=df_anc, dv='Scores', covar='Income',
                  between='Method').round(3)))
    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert (aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df)))
    # FDR-corrected post hocs with Hedges'g effect size
    ttests = df.pairwise_tests(dv='Scores', within='Time',
                               subject='Subject', padjust='fdr_bh',
                               effsize='hedges')
    assert (ttests.equals(
        pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                          padjust='fdr_bh', effsize='hedges', data=df)))
    # Pairwise Tukey
    tukey = df.pairwise_tukey(dv='Scores', between='Group')
    assert tukey.equals(
        pg.pairwise_tukey(data=df, dv='Scores', between='Group'))
    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert (aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df)))
    # Test parwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])
    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])
    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.iloc[:, :5].pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                  [1, 0.392, 0.06, -0.014, -0.149])
    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])
    # Test rcorr (correlation matrix with p-values)
    # We compare against Pingouin pairwise_corr function
    corrs = df_corr.rcorr(padjust='holm', decimals=4)
    corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
    # Upper triangle carries significance stars, lower carries r values.
    assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
    assert (corrs.at['Agreeableness', 'Neuroticism'] ==
            str(corrs2.at[2, 'r']))
    corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
    assert (corrs.at['Neuroticism', 'Agreeableness'] ==
            str(corrs2.at[2, 'p-corr'].round(4)))
    corrs = df_corr.rcorr(upper='n', decimals=5)
    corrs2 = df_corr.pairwise_corr().round(5)
    assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
    assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
    # Method = spearman does not work with Python 3.5 on Travis?
    # Instead it seems to return the Pearson correlation!
    df_corr.rcorr(method='spearman')
    df_corr.rcorr()
    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
def test_pandas(self):
    """Test pandas method.

    Smoke-tests Pingouin's pandas accessors: each DataFrame method must
    return exactly what the corresponding ``pg.*`` function returns on
    the module-level fixtures (``df`` and ``data``).
    """
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))
    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))
    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df))
    # FDR-corrected post hocs with Hedges'g effect size
    ttests = df.pairwise_ttests(dv='Scores', within='Time',
                                subject='Subject', padjust='fdr_bh',
                                effsize='hedges')
    assert ttests.equals(
        pg.pairwise_ttests(dv='Scores', within='Time', subject='Subject',
                           padjust='fdr_bh', effsize='hedges', data=df))
    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df))
    # Test parwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])
    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])
    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].values,
                                  [1, 0.392, 0.06, -0.014, -0.149])
    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert round(corrs.loc['X', 'Y'], 3) == corrs2.loc['pearson', 'r']
    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].values,
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
# NOTE(review): the line below is the tail of a plotting call whose opening
# (function name and first arguments) lies in the preceding chunk of this
# notebook-exported file — only its trailing keyword arguments are visible.
x='standarized_prediction', data=summary_frame)
# Reference line at zero residual, for reading off spread around the fit.
_ = plt.axhline(y=0)
# ####
# This graph can be used for testing homogeneity of variance. We encountered this kind of plot previously; essentially, if it has a funnel shape then we're in trouble. The plot we have shows points that are equally spread for the three groups, which implies that variances are similar across groups (which was also the conclusion reached by Levene's test).
# In[14]:
_ = pg.qqplot(summary_frame['standard_resid'], confidence=False)
# ####
# The second plot is a Q-Q plot , which tells us something about the normality of residuals in the model. We want our residuals to be normally distributed, which means that the dots on the graph should cling to the diagonal line. Ours look like they have had a bit of an argument with the diagonal line, which suggests that we may not be able to assume normality of errors and should perhaps use a robust version of ANOVA instead.
# In[15]:
# Doing Welch anova in the case if homogeniety of variance is violated(our data here dont need this test)
aov = pg.welch_anova(dv='libido', between='dose', data=df)
aov
# ## Robust ANOVA (for independent samples)
# In[16]:
# Kruskal-Wallis H-test: nonparametric alternative when normality fails.
st.kruskal(df_dose1['libido'], df_dose2['libido'], df_dose3['libido'])
# # Planned Comparison
# #### https://www.statsmodels.org/devel/examples/notebooks/generated/contrasts.html#examples-notebooks-generated-contrasts--page-root
# In[17]:
# First planned contrast: control (-2) vs the two treatment doses (+1, +1).
contrast1 = [-2, 1, 1]
import researchpy as rp
import matplotlib.pyplot as plt
from pingouin import welch_anova, read_dataset

# NOTE(review): relies on pandas being imported as `pd` earlier in the file.
df = pd.read_csv('imdb_data_clean.csv', delimiter=';')
### BAR PLOTS OF THREE GROUPS FOR A VARIABLE
# Subset three groups
usa = df[df['productionlocation'] == 'USA']['rating']
coprod = df[df['productionlocation'] == 'Coproduction']['rating']
nonusa = df[df['productionlocation'] == 'Non-USA']['rating']
means = (usa.mean(), coprod.mean(), nonusa.mean())  # Calculating means
std = (usa.std(), coprod.std(), nonusa.std()
       )  # Calculating standard deviations (error bars show +/- 1 SD)
positions = [0, 1, 2]  # Defining positions in the graph
plt.bar(positions, means, yerr=std)  # Compiling the plot
plt.xticks(positions, ['USA', 'Coproduction', 'Non-USA'],
           rotation="horizontal")  # Adding labels
plt.savefig("barmeanstd3+.pdf")  # Save figure
plt.clf()  # Clear figure
# Get descriptive table by category
print(rp.summary_cont(df['rating'].groupby(df['productionlocation'])))
# NOTE(review): abbreviated option key — relies on pandas resolving
# 'max_columns' to 'display.max_columns' via option-name pattern matching.
pd.set_option('max_columns', 9999)
# Welch's ANOVA
aov = welch_anova(dv='rating', between='productionlocation', data=df)
print(aov)