def ancova_biotuner2d(df1, df2, method, metric, data_types, plot=False):
    """Run an ANCOVA on ``metric`` between two datasets, with 'peaks' as covariate.

    The rows selected by ``df1.loc[method]`` and ``df2.loc[method]`` are
    stacked with ``pd.concat``, labelled through ``data_types``, and passed to
    ``ancova`` with ``data_type`` as the between-group factor.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Datasets to compare. Each must be indexable with ``.loc[method]`` and
        the selection must contain the ``metric`` and ``'peaks'`` columns.
        NOTE(review): presumably indexed by method name on the outer level of
        a MultiIndex -- confirm against the caller.
    method : hashable
        Label passed to ``.loc`` to select the rows of interest.
    metric : str
        Name of the dependent-variable column.
    data_types : sequence
        Labels used as ``keys`` for the concatenation (one per dataset); they
        become the values of the ``data_type`` column.
    plot : bool, optional
        When truthy, draw the distribution of ``metric`` for both datasets
        with ``sbn.distplot`` before running the test.

    Returns
    -------
    The object returned by ``ancova`` (unchanged).
    """
    df_tot = pd.concat(
        [df1.loc[method], df2.loc[method]], keys=data_types
    ).reset_index()
    # reset_index() exposes the concat keys as a 'level_0' column
    # (assumes the outer index level is unnamed -- TODO confirm).
    df_tot = df_tot.rename(columns={'level_0': 'data_type'})
    # Missing values are replaced by 0 rather than dropped.
    df_tot = df_tot.fillna(0)

    if plot:
        sbn.distplot(df1.loc[method, metric])
        sbn.distplot(df2.loc[method, metric])

    return ancova(data=df_tot, dv=metric, covar='peaks', between='data_type')
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : pandas DataFrame
        Single-row test summary (index ``'rm_corr'``) ::

        'r' : Repeated measures correlation coefficient (rounded to 3 decimals)
        'dof' : Degrees of freedom
        'pval' : one or two tailed p-value
        'CI95%' : 95% parametric confidence intervals (stored as a string)
        'power' : achieved power of the test (= 1 - type II error),
                  rounded to 3 decimals.

    Raises
    ------
    AssertionError
        If ``data`` is not a DataFrame, a requested column is missing, or
        ``x`` / ``y`` are not of a boolean, float or signed-integer dtype.
    ValueError
        If ``data`` holds fewer than 3 unique subjects.

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From Bakdash and Marusich (2017):

        "Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by -1 to 1 and represents the strength of the linear
        association between two variables."

    Results have been tested against the `rmcorr` R package.

    Please note that NaN are automatically removed from the dataframe
    (listwise deletion).

    References
    ----------
    .. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
           Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456

    .. [2] Bland, J. M., & Altman, D. G. (1995). Statistics notes: Calculating
           correlation coefficients with repeated observations:
           Part 1 - correlation within subjects. Bmj, 310(6977), 446.

    .. [3] https://github.com/cran/rmcorr

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                 r  dof      pval           CI95%  power
    rm_corr -0.507   38  0.000847  [-0.71, -0.23]   0.93
    """
    from pingouin import ancova, power_corr

    # Input validation (same order and messages as before: frame type,
    # presence of x then y, numeric dtype of x then y, presence of subject).
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    for col in (x, y):
        assert col in data.columns, 'The %s column is not in data.' % col
    for col in (x, y):
        assert data[col].dtype.kind in 'bfi', '%s must be numeric.' % col
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Listwise deletion of rows with a missing value in any used column.
    data = data[[x, y, subject]].dropna(axis=0)

    # ANCOVA of y on x with the subject as between factor (pingouin).
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    beta_within = aov.bw_  # Beta within parameter
    direction = np.sign(beta_within)

    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ss_effect = aov.at[1, 'SS']
    ss_error = aov.at[2, 'SS']
    # Coefficient magnitude from the sums of squares; sign from the slope.
    rm = direction * np.sqrt(ss_effect / (ss_effect + ss_error))

    pval = aov.at[1, 'p-unc']
    if tail == 'one-sided':
        pval = pval * 0.5

    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)

    # Assemble the one-row summary table.
    summary = {
        "r": round(rm, 3),
        "dof": int(dof),
        "pval": pval,
        "CI95%": str(ci),
        "power": round(pwr, 3),
    }
    stats = pd.DataFrame(summary, index=["rm_corr"])
    return stats
# Finish axis formatting for the variance-vs-Vm figure and save it.
ax.set_xticklabels([-80, -70, -60, -50, -40, -30], fontsize=8)
ax.set_xlabel('Vm (mV)', fontsize=8)
ax.set_yticks([0, 5, 10, 15])
ax.set_yticklabels([0, 5, 10, 15], fontsize=8)
# Raw string: '\m' is not a valid Python escape, so a plain literal triggers
# an invalid-escape warning; the text handed to matplotlib is unchanged.
ax.set_ylabel(r'Vm variance (mV$^\mathrm{2}$)', fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join(fig_folder, 'all_cells_var_vs_Vm_LIA.png'),
            transparent=True)


# %% try stats - ancova

# NOTE: each call below overwrites ``ancova_results``; only the last result
# is still bound once the cell has run.

# try the ancova using pingouin
ancova_results = ancova(data=df, dv='var', covar=['Vm'], between='state')
# for just nost and theta
ancova_results = ancova(data=df[df['state'] != 'LIA'], dv='var',
                        covar=['Vm'], between='state')
# for just nost and theta, theta hyp only
ancova_results = ancova(data=df[(df['state'] != 'LIA')
                                & (df['dVm_type'] == 'hyp')],
                        dv='var', covar=['Vm'], between='state')


# %% stats - paired stats by cell for slope and intercept
# I think this is the best stats for this
# linear regression of variance vs Vm for each cell, then compare slopes and
# intercepts with paired bootstrap for nost vs theta and nost vs LIA

# NaN-filled result arrays of shape (len(data), len(ntl)).
S_slope = np.full([len(data), len(ntl)], np.nan)
S_intcpt = np.full([len(data), len(ntl)], np.nan)
n = np.full([len(data), len(ntl)], np.nan)
# %%
# Load required libraries
import pandas as pd
import pingouin as pg


# %%
# Load data from Excel to a pandas dataframe
def load_sample_from_Excel():
    """Read the SVR and ANN ANCOVA sheets from the results workbook.

    Returns
    -------
    tuple
        ``(df_svr, df_ann)``: the "SVR - ANCOVA" and "ANN - ANCOVA" sheets,
        each read with the first row as header.
    """
    svr_sheet = "SVR - ANCOVA"
    ann_sheet = "ANN - ANCOVA"

    # The workbook location is hard-coded relative to the working directory.
    directory = ("../../../Google Drive/Academia/PhD Thesis/"
                 + "Charts, Tables, Forms, Flowcharts, Spreadsheets/")
    input_path = directory + "Paper I - SVR and ANN Results.xlsx"

    # Passing a list of sheet names makes read_excel return a dict of frames.
    sheets = pd.read_excel(input_path,
                           sheet_name=[svr_sheet, ann_sheet],
                           header=0)
    return sheets[svr_sheet], sheets[ann_sheet]


# %%
# Perform the ANCOVA
df_svr, df_ann = load_sample_from_Excel()

# stats.normaltest(df_svr["AGE"])

pg.ancova(data=df_svr,
          dv="SCORE",
          covar="ENGINE_DISPLACEMENT",
          between="CAR_SEGMENT")
def test_pandas(self):
    """Test the pandas DataFrame methods against the ``pg`` functions.

    Each section calls a statistical routine as a DataFrame method and
    checks the result against the matching ``pg.<function>`` call (or, for
    a few cases, against hard-coded reference values).

    NOTE(review): ``df``, ``df_aov3``, ``df_anc``, ``data`` and ``df_corr``
    are fixtures defined outside this method -- presumably module-level
    datasets; confirm in the enclosing test module.
    """
    # Test the ANOVA (Pandas)
    aov = df.anova(dv='Scores', between='Group', detailed=True)
    assert aov.equals(
        pg.anova(dv='Scores', between='Group', detailed=True, data=df))

    # Two-way ANOVA: type 1 and type 2 sums of squares must differ, and the
    # method result must match the function result (to 3 decimals).
    aov3_ss1 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=1)
    aov3_ss2 = df_aov3.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                             ss_type=2)
    aov3_ss2_pg = pg.anova(dv='Cholesterol', between=['Sex', 'Drug'],
                           data=df_aov3, ss_type=2)
    assert not aov3_ss1.equals(aov3_ss2)
    assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

    # Test the Welch ANOVA (Pandas)
    aov = df.welch_anova(dv='Scores', between='Group')
    assert aov.equals(pg.welch_anova(dv='Scores', between='Group', data=df))

    # Test the ANCOVA
    aov = df_anc.ancova(dv='Scores', covar='Income',
                        between='Method').round(3)
    assert (aov.equals(
        pg.ancova(data=df_anc, dv='Scores', covar='Income',
                  between='Method').round(3)))

    # Test the repeated measures ANOVA (Pandas)
    aov = df.rm_anova(dv='Scores', within='Time', subject='Subject',
                      detailed=True)
    assert (aov.equals(
        pg.rm_anova(dv='Scores', within='Time', subject='Subject',
                    detailed=True, data=df)))

    # FDR-corrected post hocs with Hedges'g effect size
    ttests = df.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                               padjust='fdr_bh', effsize='hedges')
    assert (ttests.equals(
        pg.pairwise_tests(dv='Scores', within='Time', subject='Subject',
                          padjust='fdr_bh', effsize='hedges', data=df)))

    # Pairwise Tukey
    tukey = df.pairwise_tukey(dv='Scores', between='Group')
    assert tukey.equals(
        pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

    # Test two-way mixed ANOVA
    aov = df.mixed_anova(dv='Scores', between='Group', within='Time',
                         subject='Subject', correction=False)
    assert (aov.equals(
        pg.mixed_anova(dv='Scores', between='Group', within='Time',
                       subject='Subject', correction=False, data=df)))

    # Test pairwise correlations
    corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
    corrs2 = pg.pairwise_corr(data=data, columns=['X', 'M', 'Y'],
                              method='spearman')
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation
    corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
    corrs2 = pg.partial_corr(x='X', y='Y', covar='M', method='spearman',
                             data=data)
    assert corrs['r'].equals(corrs2['r'])

    # Test partial correlation matrix (compare with the ppcor package)
    corrs = data.iloc[:, :5].pcorr().round(3)
    np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                  [1, 0.392, 0.06, -0.014, -0.149])
    # Now compare against Pingouin's own partial_corr function
    corrs = data[['X', 'Y', 'M']].pcorr()
    corrs2 = data.partial_corr(x='X', y='Y', covar='M')
    assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

    # Test rcorr (correlation matrix with p-values)
    # We compare against Pingouin pairwise_corr function
    corrs = df_corr.rcorr(padjust='holm', decimals=4)
    corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
    assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
    assert (corrs.at['Agreeableness', 'Neuroticism'] ==
            str(corrs2.at[2, 'r']))
    # With stars disabled the same cell is compared against the corrected
    # p-value as a string.
    corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
    assert (corrs.at['Neuroticism', 'Agreeableness'] ==
            str(corrs2.at[2, 'p-corr'].round(4)))
    # With upper='n' the same cell is compared against the 'n' column.
    corrs = df_corr.rcorr(upper='n', decimals=5)
    corrs2 = df_corr.pairwise_corr().round(5)
    assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
    assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
    # Method = spearman does not work with Python 3.5 on Travis?
    # Instead it seems to return the Pearson correlation!
    # (Smoke calls only: the results are not asserted.)
    df_corr.rcorr(method='spearman')
    df_corr.rcorr()

    # Test mediation analysis
    med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
    np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                  [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe containing the variables
    x, y : string
        Name of columns in data containing the two dependent variables
    subject : string
        Name of column in data containing the subject indicator
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    r : float
        Repeated measures correlation coefficient (rounded to 3 decimals)
    p : float
        P-value (halved when ``tail == 'one-sided'``; not rounded)
    dof : int
        Degrees of freedom

    Notes
    -----
    Repeated measures correlation [1]_ is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    Results have been tested against the `rmcorr` R package.

    Please note that NaN are automatically removed from the dataframe.

    The coefficient is derived from an ANCOVA table computed by
    ``pingouin.ancova``. An alternative route through statsmodels
    (``ols`` + ``anova_lm(typ=3)``) used to be kept here as commented-out
    code.

    References
    ----------
    .. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
           Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456

    Examples
    --------
    1. Repeated measures correlation (p-value shown rounded)

        >>> from pingouin import rm_corr
        >>> from pingouin.datasets import read_dataset
        >>> df = read_dataset('rm_corr')
        >>> # Compute the repeated measure correlation
        >>> rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
        (-0.507, 0.0008, 38)
    """
    # Keep only the columns in use and drop rows with any missing value.
    data = data[[x, y, subject]].dropna(axis=0)

    # ANCOVA of y on x with the subject as between factor; return_bw=True
    # also hands back the value used below for the sign of the correlation.
    from pingouin import ancova
    aov, bw = ancova(dv=y, covar=x, between=subject, data=data,
                     return_bw=True)
    direction = np.sign(bw)

    dof = int(aov.loc[2, 'DF'])
    ss_effect = aov.loc[1, 'SS']
    ss_error = aov.loc[2, 'SS']
    # Coefficient magnitude from the sums of squares.
    rm = direction * np.sqrt(ss_effect / (ss_effect + ss_error))

    tail_factor = 0.5 if tail == 'one-sided' else 1
    pval = aov.loc[1, 'p-unc'] * tail_factor

    return np.round(rm, 3), pval, dof
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'r'``: Repeated measures correlation coefficient
        * ``'dof'``: Degrees of freedom
        * ``'pval'``: one or two tailed p-value
        * ``'CI95%'``: 95% parametric confidence intervals
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Raises
    ------
    AssertionError
        If ``data`` is not a DataFrame, a requested column is missing, or
        ``x`` / ``y`` are not of a boolean, float or (un)signed integer dtype.
    ValueError
        If ``data`` holds fewer than 3 unique subjects.

    See also
    --------
    plot_rm_corr

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From `Bakdash and Marusich (2017)
    <https://doi.org/10.3389/fpsyg.2017.00456>`_:

        *Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by -1 to 1 and represents the strength of the linear
        association between two variables.*

    Results have been tested against the
    `rmcorr <https://github.com/cran/rmcorr>`_ R package.

    Please note that missing values are automatically removed from the
    dataframe (listwise deletion).

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                   r  dof      pval           CI95%     power
    rm_corr -0.50677   38  0.000847  [-0.71, -0.23]  0.929579

    Now plot using the :py:func:`pingouin.plot_rm_corr` function:

    .. plot::

        >>> import pingouin as pg
        >>> df = pg.read_dataset('rm_corr')
        >>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
    """
    from pingouin import ancova, power_corr

    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    # For max precision, make sure rounding is disabled while the ANCOVA
    # runs. The previous options are restored in a ``finally`` clause so that
    # an exception raised by ``ancova`` cannot leave the global ``options``
    # with rounding switched off.
    old_options = options.copy()
    options['round'] = None
    try:
        aov = ancova(dv=y, covar=x, between=subject, data=data)
    finally:
        options.update(old_options)  # restore options

    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    # Coefficient magnitude from the sums of squares; sign from the slope.
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    if tail == 'one-sided':
        pval = pval * 0.5
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)

    # Convert to Dataframe (the CI list is wrapped so it fills a single cell)
    stats = pd.DataFrame({"r": rm, "dof": int(dof), "pval": pval,
                          "CI95%": [ci], "power": pwr}, index=["rm_corr"])
    return _postprocess_dataframe(stats)
import pandas as pd from pingouin import ancova df = pd.read_csv("Analysis/AncovaSpreadsheet.csv") # print(df.head()) TimeStepsvsAcc = ancova( data=df, dv='Study1Question Accuracy', covar=['Number of Flows', 'Number of crossover flows', 'Number of Groups'], between='Number of Timesteps') print(TimeStepsvsAcc.head(1)) FlowsvsAcc = ancova(data=df, dv='Study1Question Accuracy', covar=[ 'Number of Timesteps', 'Number of crossover flows', 'Number of Groups' ], between='Number of Flows') print(FlowsvsAcc.head(1)) CrossovervsAcc = ancova( data=df, dv='Study1Question Accuracy', covar=['Number of Timesteps', 'Number of Flows', 'Number of Groups'], between='Number of crossover flows') print(CrossovervsAcc.head(1)) GroupsvsAcc = ancova(data=df,
# In[ ]: # ANCOVA from pingouin import ancova ancova_data = pd.DataFrame({ 'Material': v_t, 'Pre': pre_score_sum, 'Post': post_score_sum }) print(ancova_data) print('\n' + 'Learn with video') print(ancova_data[ancova_data['Material'] == 'Video'].describe().round(2)) print('\n' + 'Learn with text') print(ancova_data[ancova_data['Material'] == 'Text'].describe().round(2)) ancova(data=ancova_data, dv='Post', covar='Pre', between='Material') #From the ANCOVA table we see that the p-value (p-unc = “uncorrected p-value”) for 'material' is 0.133439. #Since this value is more than 0.05, we cannot reject the null hypothesis (H0) that states #"There is no deference of performance in between two groups". #Ref. https://pingouin-stats.org/generated/pingouin.ancova.html#pingouin.ancova # In[ ]: # Visualizing changes of average scores time_pre = pd.DataFrame({'Pre': np.repeat('Pre', 20)}) time_post = pd.DataFrame({'Post': np.repeat('Post', 20)}) seaborn_data = pd.DataFrame({ 'Material': ancova_data['Material'].append(ancova_data['Material'], ignore_index=True), 'Scores':
x = sp_ntl[l]['pre_sp_Vm'] y = sp_ntl[l]['thresh'] inds = np.logical_and(x != 0, np.abs(stats.zscore(x / y)) < 4) state_df['absolute_threshold'] = sp_ntl[l]['thresh'][inds] state_df['relative_threshold'] = -1 * sp_ntl[l]['th_dist'][inds] state_df['pre_spike_Vm'] = sp_ntl[l]['pre_sp_Vm'][inds] state_df['prior_isi'] = sp_ntl[l]['prior_isi'][inds] state_df['state'] = np.full(sp_ntl[l]['thresh'][inds].size, ntl[l]) sp_ntl[l]['state_df'] = state_df df = pd.concat([d['state_df'] for d in sp_ntl]) #df = pd.concat([sp_ntl[0]['state_df'], sp_ntl[2]['state_df']]) # try the ancova using pingouin #ancova_results = ancova(data=df, dv='absolute_threshold', # covar=['pre_spike_Vm', 'prior_isi'], between='state') ancova_results = ancova(data=df, dv='absolute_threshold', covar='pre_spike_Vm', between='state') ## test whether the slopes are different - don't know how to interpret?? #lm = ols(formula = 'absolute_threshold ~ pre_spike_Vm * state', data = df) #fit = lm.fit() #fit.summary() # %% # stats for pre_spike_Vm vs absolute_threshold over cells instead of over spikes S_slope = np.full([len(data), len(ntl)], np.nan) S_intcpt = np.full([len(data), len(ntl)], np.nan) n = np.full([len(data), len(ntl)], np.nan) # do the linear regression for each cell and each state for i in np.arange(len(data)):