Exemple #1
0
def _long_to_wide_rm(data, dv=None, within=None, subject=None):
    """Pivot a long-format repeated-measures dataframe into wide format.

    Internal helper used in pingouin.epsilon and pingouin.sphericity.

    Parameters
    ----------
    data : pd.DataFrame
        Long-format dataframe.
    dv : str
        Name of the numeric column holding the dependent variable.
    within : str or list
        Name(s) of the within-subject factor column(s).
    subject : str
        Name of the subject identifier column (must not contain NaN).

    Returns
    -------
    pd.DataFrame
        Pivot table with one row per subject and one column per level
        (or combination of levels) of ``within``. Each cell holds the mean
        of ``dv``.

    Raises
    ------
    AssertionError
        If an argument has the wrong type, a column is missing from
        ``data``, ``dv`` is not numeric or ``subject`` contains NaN.
    """
    # --- Argument types ---
    assert isinstance(dv, str), 'dv must be a string.'
    assert isinstance(subject, str), 'subject must be a string.'
    assert isinstance(within, (str, list)), 'within must be a string or list.'

    # --- Dependent variable: present and numeric ---
    available = data.columns
    assert dv in available, '%s not in data' % dv
    assert data[dv].dtype.kind in 'bfiu', '%s must be numeric' % dv

    # --- Subject identifier: present and fully defined ---
    assert subject in available, '%s not in data' % subject
    assert data[subject].notnull().all(), 'Cannot have NaN in %s' % subject

    # --- Within factor(s): always handled as a list, e.g. ['fac1', 'fac2'] ---
    factors = [within] if isinstance(within, str) else within
    for factor in factors:
        assert factor in available, '%s not in data' % factor

    # Restrict the frame to the columns involved in the pivot
    subset = data[_fl([subject, factors, dv])]

    # One row per subject; duplicated cells are collapsed to their mean
    return pd.pivot_table(subset,
                          values=dv,
                          index=subject,
                          columns=factors,
                          aggfunc='mean',
                          dropna=True)
Exemple #2
0
def mediation_analysis(data=None,
                       x=None,
                       m=None,
                       y=None,
                       covar=None,
                       alpha=0.05,
                       n_boot=500,
                       seed=None,
                       return_dist=False):
    """Mediation analysis using a bias-corrected non-parametric bootstrap
    method.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe.
    x : str
        Column name in data containing the predictor variable.
        The predictor variable must be continuous.
    m : str or list of str
        Column name(s) in data containing the mediator variable(s).
        The mediator(s) can be continuous or binary (e.g. 0 or 1).
        This function supports multiple parallel mediators.
    y : str
        Column name in data containing the outcome variable.
        The outcome variable must be continuous.
    covar : None, str, or list
        Covariate(s). If not None, the specified covariate(s) will be included
        in all regressions.
    alpha : float
        Significance threshold. Used to determine the confidence interval,
        CI = [ alpha / 2 ; 1 -  alpha / 2]
    n_boot : int
        Number of bootstrap iterations for confidence intervals and p-values
        estimation. The greater, the slower.
    seed : int or None
        Random state seed.
    return_dist : bool
        If True, the function also returns the indirect bootstrapped beta
        samples (size = n_boot). Can be plotted for instance using
        :py:func:`seaborn.distplot()` or :py:func:`seaborn.kdeplot()`
        functions.

    Returns
    -------
    stats : pd.DataFrame
        Mediation summary::

        'path' : regression model
        'coef' : regression estimates
        'se' : standard error
        'CI[2.5%]' : lower confidence interval
        'CI[97.5%]' : upper confidence interval
        'pval' : two-sided p-values
        'sig' : statistical significance

        The names of the two confidence interval columns follow ``alpha``
        (the names above correspond to ``alpha=0.05``).
    dist : np.ndarray
        Only returned if ``return_dist`` is True. Bootstrapped estimates of
        the indirect effect, i.e. an array of shape (n_boot, n_mediator)
        from which singleton dimensions have been squeezed out.

    Raises
    ------
    AssertionError
        If an argument has an invalid type, if mediators or covariates are
        duplicated, if a mediator is also listed as a covariate, if a column
        is missing from ``data`` or is not numeric / boolean, or if there are
        not more than 5 complete rows.

    Notes
    -----
    Mediation analysis is a "statistical procedure to test
    whether the effect of an independent variable X on a dependent variable
    Y (i.e., X → Y) is at least partly explained by a chain of effects of the
    independent variable on an intervening mediator variable M and of the
    intervening variable on the dependent variable (i.e., X → M → Y)"
    (from Fiedler et al. 2011).

    The **indirect effect** (also referred to as average causal mediation
    effect or ACME) of X on Y through mediator M quantifies the estimated
    difference in Y resulting from a one-unit change in X through a sequence of
    causal steps in which X affects M, which in turn affects Y.
    It is considered significant if the specified confidence interval does not
    include 0. The path 'X --> Y' is the sum of both the indirect and direct
    effect. It is sometimes referred to as total effect. For more details,
    please refer to Fiedler et al 2011 or Hayes and Rockwood 2017.

    A linear regression is used if the mediator variable is continuous and a
    logistic regression if the mediator variable is dichotomous (binary). Note
    that this function also supports parallel multiple mediators: "in such
    models, mediators may be and often are correlated, but nothing in the
    model allows one mediator to causally influence another."
    (Hayes and Rockwood 2017)

    This function will only work well if the outcome variable is continuous.
    It does not support binary or ordinal outcome variable. For more
    advanced mediation models, please refer to the `lavaan` or `mediation` R
    packages, or the PROCESS macro for SPSS.

    The two-sided p-value of the indirect effect is computed using the
    bootstrap distribution, as in the mediation R package. However, the p-value
    should be interpreted with caution since it is a) not constructed
    conditioned on a true null hypothesis (see Hayes and Rockwood 2017) and b)
    varies depending on the number of bootstrap samples and the random seed.

    Note that rows with NaN are automatically removed.

    Results have been tested against the R mediation package and this tutorial
    https://data.library.virginia.edu/introduction-to-mediation-analysis/

    References
    ----------
    .. [1] Baron, R. M. & Kenny, D. A. The moderator–mediator variable
           distinction in social psychological research: Conceptual, strategic,
           and statistical considerations. J. Pers. Soc. Psychol. 51, 1173–1182
           (1986).

    .. [2] Fiedler, K., Schott, M. & Meiser, T. What mediation analysis can
           (not) do. J. Exp. Soc. Psychol. 47, 1231–1236 (2011).

    .. [3] Hayes, A. F. & Rockwood, N. J. Regression-based statistical
           mediation and moderation analysis in clinical research:
           Observations, recommendations, and implementation. Behav. Res.
           Ther. 98, 39–57 (2017).

    .. [4] https://cran.r-project.org/web/packages/mediation/mediation.pdf

    .. [5] http://lavaan.ugent.be/tutorial/mediation.html

    .. [6] https://github.com/rmill040/pymediation

    Examples
    --------
    1. Simple mediation analysis

    >>> from pingouin import mediation_analysis, read_dataset
    >>> df = read_dataset('mediation')
    >>> mediation_analysis(data=df, x='X', m='M', y='Y', alpha=0.05, seed=42)
           path    coef      se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.5610  0.0945  4.391362e-08    0.3735     0.7485  Yes
    1     Y ~ M  0.6542  0.0858  1.612674e-11    0.4838     0.8245  Yes
    2     Total  0.3961  0.1112  5.671128e-04    0.1755     0.6167  Yes
    3    Direct  0.0396  0.1096  7.187429e-01   -0.1780     0.2572   No
    4  Indirect  0.3565  0.0833  0.000000e+00    0.2198     0.5377  Yes

    2. Return the indirect bootstrapped beta coefficients

    >>> stats, dist = mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                                  return_dist=True)
    >>> print(dist.shape)
    (500,)

    3. Mediation analysis with a binary mediator variable

    >>> mediation_analysis(data=df, x='X', m='Mbin', y='Y', seed=42)
           path    coef      se      pval  CI[2.5%]  CI[97.5%]  sig
    0  Mbin ~ X -0.0205  0.1159  0.859392   -0.2476     0.2066   No
    1  Y ~ Mbin -0.1354  0.4118  0.743076   -0.9525     0.6818   No
    2     Total  0.3961  0.1112  0.000567    0.1755     0.6167  Yes
    3    Direct  0.3956  0.1117  0.000614    0.1739     0.6173  Yes
    4  Indirect  0.0023  0.0495  0.960000   -0.0715     0.1441   No

    4. Mediation analysis with covariates

    >>> mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                    covar=['Mbin', 'Ybin'], seed=42)
           path    coef      se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.5594  0.0968  9.394635e-08    0.3672     0.7516  Yes
    1     Y ~ M  0.6660  0.0861  1.017261e-11    0.4951     0.8368  Yes
    2     Total  0.4204  0.1129  3.324252e-04    0.1962     0.6446  Yes
    3    Direct  0.0645  0.1104  5.608583e-01   -0.1548     0.2837   No
    4  Indirect  0.3559  0.0865  0.000000e+00    0.2093     0.5530  Yes

    5. Mediation analysis with multiple parallel mediators

    >>> mediation_analysis(data=df, x='X', m=['M', 'Mbin'], y='Y', seed=42)
                path    coef      se          pval  CI[2.5%]  CI[97.5%]  sig
    0          M ~ X  0.5610  0.0945  4.391362e-08    0.3735     0.7485  Yes
    1       Mbin ~ X -0.0051  0.0290  8.592408e-01   -0.0626     0.0523   No
    2          Y ~ M  0.6537  0.0863  2.118163e-11    0.4824     0.8250  Yes
    3       Y ~ Mbin -0.0640  0.3282  8.456998e-01   -0.7154     0.5873   No
    4          Total  0.3961  0.1112  5.671128e-04    0.1755     0.6167  Yes
    5         Direct  0.0395  0.1102  7.206301e-01   -0.1792     0.2583   No
    6     Indirect M  0.3563  0.0845  0.000000e+00    0.2148     0.5385  Yes
    7  Indirect Mbin  0.0003  0.0097  9.520000e-01   -0.0172     0.0252   No
    """
    # Sanity check
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert isinstance(m, (list, str)), 'Mediator(s) must be a list or string.'
    assert isinstance(covar, (type(None), str, list))
    if isinstance(m, str):
        m = [m]
    n_mediator = len(m)
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame.'
    # Check for duplicates
    assert n_mediator == len(set(m)), 'Cannot have duplicates mediators.'
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(covar, list):
        assert len(covar) == len(set(covar)), 'Cannot have duplicates covar.'
        assert set(m).isdisjoint(covar), 'Mediator cannot be in covar.'
    # Check that columns are in dataframe
    # NOTE(review): _fl is defined elsewhere; it is assumed to flatten the
    # nested list and to drop None entries (covar may be None) -- confirm.
    columns = _fl([x, m, y, covar])
    keys = data.columns
    assert all(c in keys for c in columns), 'Column(s) are not in DataFrame.'
    # Check that columns are numeric (boolean, float, signed or unsigned int)
    err_msg = "Columns must be numeric or boolean."
    assert all(data[c].dtype.kind in 'bfiu' for c in columns), err_msg

    # Drop rows with NAN Values
    data = data[columns].dropna()
    n = data.shape[0]
    assert n > 5, 'DataFrame must have more than 5 samples (rows).'

    # A logistic model is only used when *every* mediator has exactly two
    # unique values; otherwise all mediators are fitted with a linear model.
    mtype = 'logistic' if all(data[m].nunique() == 2) else 'linear'

    # Name of CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Compute regressions
    cols = ['names', 'coef', 'se', 'pval', ll_name, ul_name]

    # For speed, we pass np.array instead of pandas DataFrame
    X_val = data[_fl([x, covar])].values  # X + covar as predictors
    XM_val = data[_fl([x, m, covar])].values  # X + M + covar as predictors
    M_val = data[m].values  # M as target (no covariates)
    y_val = data[y].values  # y as target (no covariates)

    # M(j) ~ X + covar
    # NOTE(review): row 1 of the regression output is presumably the
    # coefficient of the first predictor (row 0 being the intercept).
    sxm = {}
    for col_idx, j in enumerate(m):
        if mtype == 'linear':
            sxm[j] = linear_regression(X_val, M_val[:, col_idx],
                                       alpha=alpha).loc[[1], cols]
        else:
            sxm[j] = logistic_regression(X_val, M_val[:, col_idx],
                                         alpha=alpha).loc[[1], cols]
        sxm[j].loc[1, 'names'] = '%s ~ X' % j
    sxm = pd.concat(sxm, ignore_index=True)

    # Y ~ M + covar
    # A DataFrame is passed here (not an array); the 'names' column is then
    # used below to label each path with its mediator.
    smy = linear_regression(data[_fl([m, covar])], y_val,
                            alpha=alpha).loc[1:n_mediator, cols]

    # Average Total Effects (Y ~ X + covar)
    sxy = linear_regression(X_val, y_val, alpha=alpha).loc[[1], cols]

    # Average Direct Effects (Y ~ X + M + covar)
    direct = linear_regression(XM_val, y_val, alpha=alpha).loc[[1], cols]

    # Rename paths
    smy['names'] = smy['names'].apply(lambda name: 'Y ~ %s' % name)
    direct.loc[1, 'names'] = 'Direct'
    sxy.loc[1, 'names'] = 'Total'

    # Concatenate and create sig column
    stats = pd.concat((sxm, smy, sxy, direct), ignore_index=True)
    stats['sig'] = np.where(stats['pval'] < alpha, 'Yes', 'No')

    # Bootstrap confidence intervals: one row of resampled row indices
    # (with replacement) per bootstrap iteration.
    rng = np.random.RandomState(seed)
    boot_idx = rng.choice(np.arange(n), replace=True, size=(n_boot, n))
    ab_estimates = np.zeros(shape=(n_boot, n_mediator))
    for i in range(n_boot):
        ab_estimates[i, :] = _point_estimate(X_val, XM_val, M_val, y_val,
                                             boot_idx[i, :], n_mediator,
                                             mtype)

    # Indirect effect(s) computed on the original, non-resampled rows
    ab = _point_estimate(X_val, XM_val, M_val, y_val, np.arange(n), n_mediator,
                         mtype)
    indirect = {
        'names': m,
        'coef': ab,
        'se': ab_estimates.std(ddof=1, axis=0),
        'pval': [],
        ll_name: [],
        ul_name: [],
        'sig': []
    }

    for j in range(n_mediator):
        ci_j = _bca(ab_estimates[:, j],
                    indirect['coef'][j],
                    alpha=alpha,
                    n_boot=n_boot)
        indirect[ll_name].append(min(ci_j))
        indirect[ul_name].append(max(ci_j))
        # Bootstrapped p-value of indirect effect
        # Note that this is less accurate than a permutation test because the
        # bootstrap distribution is not conditioned on a true null hypothesis.
        # For more details see Hayes and Rockwood 2017
        indirect['pval'].append(
            _pval_from_bootci(ab_estimates[:, j], indirect['coef'][j]))
        indirect['sig'].append('Yes' if indirect['pval'][j] < alpha else 'No')

    # Create output dataframe
    indirect = pd.DataFrame.from_dict(indirect)
    if n_mediator == 1:
        indirect['names'] = 'Indirect'
    else:
        indirect['names'] = indirect['names'].apply(
            lambda name: 'Indirect %s' % name)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the equivalent, supported way to stack the rows.
    stats = pd.concat([stats, indirect], ignore_index=True, sort=False)
    stats = stats.rename(columns={'names': 'path'})

    # Round
    col_to_round = ['coef', 'se', ll_name, ul_name]
    stats[col_to_round] = stats[col_to_round].round(4)

    if return_dist:
        return stats, np.squeeze(ab_estimates)
    else:
        return stats