Example 1
import pandas as pd
import seaborn as sbn
from pingouin import ancova


def ancova_biotuner2d(df1, df2, method, metric, data_types, plot=False):
    # Stack the rows of both dataframes for the chosen extraction method,
    # labelling each source with its data type (the between factor).
    df_tot = pd.concat([df1.loc[method], df2.loc[method]],
                       keys=data_types).reset_index()
    df_tot.rename(columns={'level_0': 'data_type'}, inplace=True)
    df_tot = df_tot.fillna(0)
    if plot:
        sbn.distplot(df1.loc[method, metric])
        sbn.distplot(df2.loc[method, metric])
    return ancova(data=df_tot, dv=metric, covar='peaks', between='data_type')
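
# A minimal usage sketch with hypothetical data: each input frame is indexed
# by extraction method, one row per signal, holding the metric of interest
# plus the 'peaks' covariate the function expects ('harmsim' and 'cons' are
# made-up names here).
import numpy as np

rng = np.random.default_rng(0)
idx = pd.Index(['harmsim'] * 20, name='method')
df_eeg = pd.DataFrame({'cons': rng.normal(0.5, 0.1, 20),
                       'peaks': rng.integers(3, 8, 20)}, index=idx)
df_sim = pd.DataFrame({'cons': rng.normal(0.4, 0.1, 20),
                       'peaks': rng.integers(3, 8, 20)}, index=idx)
res = ancova_biotuner2d(df_eeg, df_sim, method='harmsim', metric='cons',
                        data_types=['EEG', 'simulated'])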
Example 2
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'r' : Repeated measures correlation coefficient
        'dof' : Degrees of freedom
        'pval' : one or two tailed p-value
        'CI95' : 95% parametric confidence intervals
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From Bakdash and Marusich (2017):

        "Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by −1 to 1 and represents the strength of the linear
        association between two variables."

    Results have been tested against the `rmcorr` R package.

    Please note that NaN values are automatically removed from the dataframe
    (listwise deletion).

    References
    ----------
    .. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
           Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456

    .. [2] Bland, J. M., & Altman, D. G. (1995). Statistics notes: Calculating
           correlation coefficients with repeated observations:
           Part 1—correlation within subjects. Bmj, 310(6977), 446.

    .. [3] https://github.com/cran/rmcorr

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                 r  dof      pval           CI95%  power
    rm_corr -0.507   38  0.000847  [-0.71, -0.23]   0.93
    """
    from pingouin import ancova, power_corr
    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfi', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfi', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)
    # Convert to Dataframe
    stats = pd.DataFrame(
        {
            "r": round(rm, 3),
            "dof": int(dof),
            "pval": pval,
            "CI95%": str(ci),
            "power": round(pwr, 3)
        },
        index=["rm_corr"])
    return stats
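
# The coefficient extraction above can be sanity-checked by hand against
# pingouin's ANCOVA table (a sketch using the bundled 'rm_corr' dataset; it
# relies on the bw_ attribute that ancova attaches when there is a single
# covariate, exactly as the function does):
import numpy as np
import pingouin as pg

df = pg.read_dataset('rm_corr')
aov = pg.ancova(data=df, dv='PacO2', covar='pH', between='Subject')
ss_x, ss_err = aov.at[1, 'SS'], aov.at[2, 'SS']
r = np.sign(aov.bw_) * np.sqrt(ss_x / (ss_x + ss_err))  # approx. -0.507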
Example 3
ax.set_xticklabels([-80, -70, -60, -50, -40, -30], fontsize=8)
ax.set_xlabel('Vm (mV)', fontsize=8)
ax.set_yticks([0, 5, 10, 15])
ax.set_yticklabels([0, 5, 10, 15], fontsize=8)
ax.set_ylabel(r'Vm variance (mV$^\mathrm{2}$)', fontsize=8)
plt.tight_layout()
plt.savefig(os.path.join(fig_folder, 'all_cells_var_vs_Vm_LIA.png'), transparent=True)

# %% try stats - ancova

# try the ancova using pingouin
ancova_results = ancova(data=df, dv='var',
                        covar=['Vm'], between='state')
# for just nost and theta
ancova_results = ancova(data=df[df['state'] != 'LIA'], dv='var',
                        covar=['Vm'], between='state')
# for just nost and theta, theta hyp only
ancova_results = ancova(data=df[(df['state'] != 'LIA') & (df['dVm_type'] == 'hyp')],
                        dv='var', covar=['Vm'], between='state')

# %% stats - paired stats by cell for slope and intercept

# I think this is the best stats for this
# linear regression of variance vs Vm for each cell, then compare slopes and
# intercepts with paired bootstrap for nost vs theta and nost vs LIA
S_slope = np.full([len(data), len(ntl)], np.nan)
S_intcpt = np.full([len(data), len(ntl)], np.nan)
n = np.full([len(data), len(ntl)], np.nan)
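
# A hedged sketch of the per-cell fits these arrays are allocated for; the
# original loop is not part of this excerpt, so the layout of data[i] (here:
# a dict of per-state arrays with 'Vm' and 'var' entries) is an assumption.
from scipy.stats import linregress

for i in np.arange(len(data)):
    for l in np.arange(len(ntl)):
        x = data[i][ntl[l]]['Vm']
        y = data[i][ntl[l]]['var']
        if x.size > 2:
            fit = linregress(x, y)
            S_slope[i, l] = fit.slope
            S_intcpt[i, l] = fit.intercept
            n[i, l] = x.size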
Example 4
# %%
# Load required libraries
import pandas as pd
import pingouin as pg


# %%
# Load data from Excel to a pandas dataframe
def load_sample_from_Excel():
    directory = ("../../../Google Drive/Academia/PhD Thesis/" +
                 "Charts, Tables, Forms, Flowcharts, Spreadsheets/")
    input_file = "Paper I - SVR and ANN Results.xlsx"
    input_path = directory + input_file
    sheets_dict = pd.read_excel(input_path,
                                sheet_name=["SVR - ANCOVA", "ANN - ANCOVA"],
                                header=0)
    df_svr = sheets_dict["SVR - ANCOVA"]
    df_ann = sheets_dict["ANN - ANCOVA"]
    return df_svr, df_ann


# %%
# Perform the ANCOVA
df_svr, df_ann = load_sample_from_Excel()
# stats.normaltest(df_svr["AGE"])
pg.ancova(data=df_svr,
          dv="SCORE",
          covar="ENGINE_DISPLACEMENT",
          between="CAR_SEGMENT")
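
# %%
# In a plain script (outside a notebook cell) the table above is discarded;
# a small follow-up that captures it and pulls out the group effect:
res = pg.ancova(data=df_svr,
                dv="SCORE",
                covar="ENGINE_DISPLACEMENT",
                between="CAR_SEGMENT")
print(res.loc[res["Source"] == "CAR_SEGMENT", ["F", "p-unc"]])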
Example 5
    def test_pandas(self):
        """Test pandas method.
        """
        # Test the ANOVA (Pandas)
        aov = df.anova(dv='Scores', between='Group', detailed=True)
        assert aov.equals(
            pg.anova(dv='Scores', between='Group', detailed=True, data=df))
        aov3_ss1 = df_aov3.anova(dv='Cholesterol',
                                 between=['Sex', 'Drug'],
                                 ss_type=1)
        aov3_ss2 = df_aov3.anova(dv='Cholesterol',
                                 between=['Sex', 'Drug'],
                                 ss_type=2)
        aov3_ss2_pg = pg.anova(dv='Cholesterol',
                               between=['Sex', 'Drug'],
                               data=df_aov3,
                               ss_type=2)
        assert not aov3_ss1.equals(aov3_ss2)
        assert aov3_ss2.round(3).equals(aov3_ss2_pg.round(3))

        # Test the Welch ANOVA (Pandas)
        aov = df.welch_anova(dv='Scores', between='Group')
        assert aov.equals(pg.welch_anova(dv='Scores', between='Group',
                                         data=df))

        # Test the ANCOVA
        aov = df_anc.ancova(dv='Scores', covar='Income',
                            between='Method').round(3)
        assert (aov.equals(
            pg.ancova(data=df_anc,
                      dv='Scores',
                      covar='Income',
                      between='Method').round(3)))

        # Test the repeated measures ANOVA (Pandas)
        aov = df.rm_anova(dv='Scores',
                          within='Time',
                          subject='Subject',
                          detailed=True)
        assert (aov.equals(
            pg.rm_anova(dv='Scores',
                        within='Time',
                        subject='Subject',
                        detailed=True,
                        data=df)))

        # FDR-corrected post hocs with Hedges' g effect size
        ttests = df.pairwise_tests(dv='Scores',
                                   within='Time',
                                   subject='Subject',
                                   padjust='fdr_bh',
                                   effsize='hedges')
        assert (ttests.equals(
            pg.pairwise_tests(dv='Scores',
                              within='Time',
                              subject='Subject',
                              padjust='fdr_bh',
                              effsize='hedges',
                              data=df)))

        # Pairwise Tukey
        tukey = df.pairwise_tukey(dv='Scores', between='Group')
        assert tukey.equals(
            pg.pairwise_tukey(data=df, dv='Scores', between='Group'))

        # Test two-way mixed ANOVA
        aov = df.mixed_anova(dv='Scores',
                             between='Group',
                             within='Time',
                             subject='Subject',
                             correction=False)
        assert (aov.equals(
            pg.mixed_anova(dv='Scores',
                           between='Group',
                           within='Time',
                           subject='Subject',
                           correction=False,
                           data=df)))

        # Test pairwise correlations
        corrs = data.pairwise_corr(columns=['X', 'M', 'Y'], method='spearman')
        corrs2 = pg.pairwise_corr(data=data,
                                  columns=['X', 'M', 'Y'],
                                  method='spearman')
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation
        corrs = data.partial_corr(x='X', y='Y', covar='M', method='spearman')
        corrs2 = pg.partial_corr(x='X',
                                 y='Y',
                                 covar='M',
                                 method='spearman',
                                 data=data)
        assert corrs['r'].equals(corrs2['r'])

        # Test partial correlation matrix (compare with the ppcor package)
        corrs = data.iloc[:, :5].pcorr().round(3)
        np.testing.assert_array_equal(corrs.iloc[0, :].to_numpy(),
                                      [1, 0.392, 0.06, -0.014, -0.149])
        # Now compare against Pingouin's own partial_corr function
        corrs = data[['X', 'Y', 'M']].pcorr()
        corrs2 = data.partial_corr(x='X', y='Y', covar='M')
        assert np.isclose(corrs.at['X', 'Y'], corrs2.at['pearson', 'r'])

        # Test rcorr (correlation matrix with p-values)
        # We compare against Pingouin pairwise_corr function
        corrs = df_corr.rcorr(padjust='holm', decimals=4)
        corrs2 = df_corr.pairwise_corr(padjust='holm').round(4)
        assert corrs.at['Neuroticism', 'Agreeableness'] == '*'
        assert (corrs.at['Agreeableness',
                         'Neuroticism'] == str(corrs2.at[2, 'r']))
        corrs = df_corr.rcorr(padjust='holm', stars=False, decimals=4)
        assert (corrs.at['Neuroticism',
                         'Agreeableness'] == str(corrs2.at[2,
                                                           'p-corr'].round(4)))
        corrs = df_corr.rcorr(upper='n', decimals=5)
        corrs2 = df_corr.pairwise_corr().round(5)
        assert corrs.at['Extraversion', 'Openness'] == corrs2.at[4, 'n']
        assert corrs.at['Openness', 'Extraversion'] == str(corrs2.at[4, 'r'])
        # Method = spearman does not work with Python 3.5 on Travis?
        # Instead it seems to return the Pearson correlation!
        df_corr.rcorr(method='spearman')
        df_corr.rcorr()

        # Test mediation analysis
        med = data.mediation_analysis(x='X', m='M', y='Y', seed=42, n_boot=500)
        np.testing.assert_array_equal(med.loc[:, 'coef'].round(4).to_numpy(),
                                      [0.5610, 0.6542, 0.3961, 0.0396, 0.3565])
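
# The pattern exercised throughout this test: importing pingouin attaches
# these methods to pandas DataFrames, so df.anova(...) is shorthand for
# pg.anova(data=df, ...). A standalone sketch with a bundled dataset:
import pingouin as pg

df_demo = pg.read_dataset('anova')
assert df_demo.anova(dv='Pain threshold', between='Hair color').equals(
    pg.anova(data=df_demo, dv='Pain threshold', between='Hair color'))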
Example 6
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe containing the variables
    x, y : string
        Name of columns in data containing the two dependent variables
    subject : string
        Name of column in data containing the subject indicator
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    r : float
        Repeated measures correlation coefficient
    p : float
        P-value
    dof : int
        Degrees of freedom

    Notes
    -----
    Repeated measures correlation [1]_ is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    Results have been tested against the `rmcorr` R package.

    Please note that NaN values are automatically removed from the dataframe.

    References
    ----------

    .. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
       Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456

    Examples
    --------
    1. Repeated measures correlation

        >>> from pingouin import rm_corr
        >>> from pingouin.datasets import read_dataset
        >>> df = read_dataset('rm_corr')
        >>> # Compute the repeated measure correlation
        >>> rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
        (-0.507, 0.0008, 38)
    """
    # Remove Nans
    data = data[[x, y, subject]].dropna(axis=0)

    # Using STATSMODELS
    # from pingouin.utils import is_statsmodels_installed
    # is_statsmodels_installed(raise_error=True)
    # from statsmodels.api import stats
    # from statsmodels.formula.api import ols
    # # ANCOVA model
    # formula = y + ' ~ ' + 'C(' + subject + ') + ' + x
    # model = ols(formula, data=data).fit()
    # table = stats.anova_lm(model, typ=3)
    # # Extract the sign of the correlation and dof
    # sign = np.sign(model.params[x])
    # dof = int(table.loc['Residual', 'df'])
    # # Extract correlation coefficient from sum of squares
    # ssfactor = table.loc[x, 'sum_sq']
    # sserror = table.loc['Residual', 'sum_sq']
    # rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    # # Extract p-value
    # pval = table.loc[x, 'PR(>F)']
    # pval *= 0.5 if tail == 'one-sided' else 1

    # Using PINGOUIN
    from pingouin import ancova
    aov, bw = ancova(dv=y, covar=x, between=subject, data=data, return_bw=True)
    sign = np.sign(bw)
    dof = int(aov.loc[2, 'DF'])
    ssfactor = aov.loc[1, 'SS']
    sserror = aov.loc[2, 'SS']
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.loc[1, 'p-unc']
    pval *= 0.5 if tail == 'one-sided' else 1

    return np.round(rm, 3), pval, dof
Example 7
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'r'``: Repeated measures correlation coefficient
        * ``'dof'``: Degrees of freedom
        * ``'pval'``: one or two tailed p-value
        * ``'CI95'``: 95% parametric confidence intervals
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    plot_rm_corr

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From `Bakdash and Marusich (2017)
    <https://doi.org/10.3389/fpsyg.2017.00456>`_:

        *Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by −1 to 1 and represents the strength of the linear
        association between two variables.*

    Results have been tested against the
    `rmcorr <https://github.com/cran/rmcorr>`_ R package.

    Please note that missing values are automatically removed from the
    dataframe (listwise deletion).

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                   r  dof      pval           CI95%     power
    rm_corr -0.50677   38  0.000847  [-0.71, -0.23]  0.929579

    Now plot using the :py:func:`pingouin.plot_rm_corr` function:

    .. plot::

        >>> import pingouin as pg
        >>> df = pg.read_dataset('rm_corr')
        >>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
    """
    from pingouin import ancova, power_corr
    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    # For max precision, make sure rounding is disabled
    old_options = options.copy()
    options['round'] = None
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    options.update(old_options)  # restore options
    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)
    # Convert to Dataframe
    stats = pd.DataFrame({"r": rm,
                          "dof": int(dof),
                          "pval": pval,
                          "CI95%": [ci],
                          "power": pwr}, index=["rm_corr"])
    return _postprocess_dataframe(stats)
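
# The rounding dance above uses pingouin's global options dictionary; callers
# can do the same around any call when full precision matters (a sketch):
import pingouin as pg

old_opts = pg.options.copy()
pg.options['round'] = None  # full-precision output tables
try:
    res = pg.rm_corr(data=pg.read_dataset('rm_corr'),
                     x='pH', y='PacO2', subject='Subject')
finally:
    pg.options.update(old_opts)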
Example 8
import pandas as pd
from pingouin import ancova

df = pd.read_csv("Analysis/AncovaSpreadsheet.csv")

# print(df.head())

TimeStepsvsAcc = ancova(
    data=df,
    dv='Study1Question Accuracy',
    covar=['Number of Flows', 'Number of crossover flows', 'Number of Groups'],
    between='Number of Timesteps')
print(TimeStepsvsAcc.head(1))

FlowsvsAcc = ancova(data=df,
                    dv='Study1Question Accuracy',
                    covar=[
                        'Number of Timesteps', 'Number of crossover flows',
                        'Number of Groups'
                    ],
                    between='Number of Flows')
print(FlowsvsAcc.head(1))

CrossovervsAcc = ancova(
    data=df,
    dv='Study1Question Accuracy',
    covar=['Number of Timesteps', 'Number of Flows', 'Number of Groups'],
    between='Number of crossover flows')
print(CrossovervsAcc.head(1))

GroupsvsAcc = ancova(data=df,
                     dv='Study1Question Accuracy',
                     covar=['Number of Timesteps', 'Number of Flows',
                            'Number of crossover flows'],
                     between='Number of Groups')
print(GroupsvsAcc.head(1))
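
# The four calls above follow one pattern (each factor in turn as the
# between-group variable, the other three as covariates); the same thing
# as a loop:
factors = ['Number of Timesteps', 'Number of Flows',
           'Number of crossover flows', 'Number of Groups']
for between in factors:
    result = ancova(data=df,
                    dv='Study1Question Accuracy',
                    covar=[f for f in factors if f != between],
                    between=between)
    print(result.head(1))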
Example 9
# In[ ]:

# ANCOVA
from pingouin import ancova
ancova_data = pd.DataFrame({
    'Material': v_t,
    'Pre': pre_score_sum,
    'Post': post_score_sum
})
print(ancova_data)
print('\n' + 'Learn with video')
print(ancova_data[ancova_data['Material'] == 'Video'].describe().round(2))
print('\n' + 'Learn with text')
print(ancova_data[ancova_data['Material'] == 'Text'].describe().round(2))
ancova(data=ancova_data, dv='Post', covar='Pre', between='Material')

#From the ANCOVA table we see that the p-value (p-unc = “uncorrected p-value”) for 'Material' is 0.133439.
#Since this value is greater than 0.05, we cannot reject the null hypothesis (H0), which states that
#there is no difference in performance between the two groups.
#Ref. https://pingouin-stats.org/generated/pingouin.ancova.html#pingouin.ancova

# In[ ]:

# Visualizing changes of average scores
time_pre = pd.DataFrame({'Pre': np.repeat('Pre', 20)})
time_post = pd.DataFrame({'Post': np.repeat('Post', 20)})
seaborn_data = pd.DataFrame({
    'Material':
    ancova_data['Material'].append(ancova_data['Material'], ignore_index=True),
    'Scores':
    ancova_data['Pre'].append(ancova_data['Post'], ignore_index=True),
    'Time':
    time_pre['Pre'].append(time_post['Post'], ignore_index=True)
})
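
# A quick way to draw the pre/post trend per material (a sketch; assumes
# seaborn and matplotlib are installed):
import seaborn as sns
import matplotlib.pyplot as plt

sns.pointplot(data=seaborn_data, x='Time', y='Scores', hue='Material')
plt.show()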
Example 10
# Build one dataframe of spike measures per state (the loop header below is
# reconstructed; this fragment assumes numpy as np, pandas as pd and
# scipy.stats as stats are in scope, as in the earlier excerpts):
for l in np.arange(len(ntl)):
    state_df = pd.DataFrame()
    x = sp_ntl[l]['pre_sp_Vm']
    y = sp_ntl[l]['thresh']
    inds = np.logical_and(x != 0, np.abs(stats.zscore(x / y)) < 4)
    state_df['absolute_threshold'] = sp_ntl[l]['thresh'][inds]
    state_df['relative_threshold'] = -1 * sp_ntl[l]['th_dist'][inds]
    state_df['pre_spike_Vm'] = sp_ntl[l]['pre_sp_Vm'][inds]
    state_df['prior_isi'] = sp_ntl[l]['prior_isi'][inds]
    state_df['state'] = np.full(sp_ntl[l]['thresh'][inds].size, ntl[l])
    sp_ntl[l]['state_df'] = state_df
df = pd.concat([d['state_df'] for d in sp_ntl])
#df = pd.concat([sp_ntl[0]['state_df'], sp_ntl[2]['state_df']])
# try the ancova using pingouin
#ancova_results = ancova(data=df, dv='absolute_threshold',
#                        covar=['pre_spike_Vm', 'prior_isi'], between='state')
ancova_results = ancova(data=df,
                        dv='absolute_threshold',
                        covar='pre_spike_Vm',
                        between='state')
## test whether the slopes are different - don't know how to interpret??
#lm = ols(formula = 'absolute_threshold ~ pre_spike_Vm * state', data = df)
#fit = lm.fit()
#fit.summary()

# %%

# stats for pre_spike_Vm vs absolute_threshold over cells instead of over spikes
S_slope = np.full([len(data), len(ntl)], np.nan)
S_intcpt = np.full([len(data), len(ntl)], np.nan)
n = np.full([len(data), len(ntl)], np.nan)

# do the linear regression for each cell and each state
for i in np.arange(len(data)):