def _long_to_wide_rm(data, dv=None, within=None, subject=None):
    """Convert long-format dataframe to wide-format.

    This internal function is used in pingouin.epsilon and
    pingouin.sphericity.
    """
    # Check arguments
    assert isinstance(dv, str), 'dv must be a string.'
    assert isinstance(subject, str), 'subject must be a string.'
    assert isinstance(within, (str, list)), 'within must be a string or list.'
    # Check that all columns are present
    assert dv in data.columns, '%s not in data' % dv
    assert data[dv].dtype.kind in 'bfiu', '%s must be numeric' % dv
    assert subject in data.columns, '%s not in data' % subject
    assert not data[subject].isnull().any(), 'Cannot have NaN in %s' % subject
    if isinstance(within, str):
        within = [within]  # within = ['fac1'] or ['fac1', 'fac2']
    for w in within:
        assert w in data.columns, '%s not in data' % w
    # Keep only the relevant columns
    data = data[_fl([subject, within, dv])]
    # Convert to wide-format + collapse to the mean
    data = pd.pivot_table(data, index=subject, values=dv, columns=within,
                          aggfunc='mean', dropna=True)
    return data
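

# Illustrative sketch (not part of the module's API): how the helper above
# pivots a long-format repeated-measures dataframe. The column names are
# hypothetical and chosen only for this example.
#
#     >>> import pandas as pd
#     >>> df_long = pd.DataFrame({'Subj': [1, 1, 2, 2],
#     ...                         'Time': ['pre', 'post', 'pre', 'post'],
#     ...                         'Score': [4.0, 6.0, 5.0, 7.0]})
#     >>> _long_to_wide_rm(df_long, dv='Score', within='Time', subject='Subj')
#
# The result has one row per subject and one column per level of 'Time',
# with cell values averaged when a subject has several observations per cell.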


def mediation_analysis(data=None, x=None, m=None, y=None, covar=None,
                       alpha=0.05, n_boot=500, seed=None, return_dist=False):
    """Mediation analysis using a bias-corrected non-parametric bootstrap method.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe.
    x : str
        Column name in data containing the predictor variable.
        The predictor variable must be continuous.
    m : str or list of str
        Column name(s) in data containing the mediator variable(s).
        The mediator(s) can be continuous or binary (e.g. 0 or 1).
        This function supports multiple parallel mediators.
    y : str
        Column name in data containing the outcome variable.
        The outcome variable must be continuous.
    covar : None, str, or list
        Covariate(s). If not None, the specified covariate(s) will be
        included in all regressions.
    alpha : float
        Significance threshold. Used to determine the confidence interval,
        CI = [alpha / 2 ; 1 - alpha / 2].
    n_boot : int
        Number of bootstrap iterations for confidence intervals and p-values
        estimation. The greater, the slower.
    seed : int or None
        Random state seed.
    return_dist : bool
        If True, the function also returns the indirect bootstrapped beta
        samples (size = n_boot). They can be plotted, for instance, with
        :py:func:`seaborn.distplot` or :py:func:`seaborn.kdeplot`.

    Returns
    -------
    stats : pd.DataFrame
        Mediation summary::

            'path'      : regression model
            'coef'      : regression estimates
            'se'        : standard error
            'CI[2.5%]'  : lower confidence interval
            'CI[97.5%]' : upper confidence interval
            'pval'      : two-sided p-values
            'sig'       : statistical significance

    Notes
    -----
    Mediation analysis is a "statistical procedure to test whether the effect
    of an independent variable X on a dependent variable Y (i.e., X → Y) is
    at least partly explained by a chain of effects of the independent
    variable on an intervening mediator variable M and of the intervening
    variable on the dependent variable (i.e., X → M → Y)"
    (from Fiedler et al. 2011).

    The **indirect effect** (also referred to as average causal mediation
    effect or ACME) of X on Y through mediator M quantifies the estimated
    difference in Y resulting from a one-unit change in X through a sequence
    of causal steps in which X affects M, which in turn affects Y. It is
    considered significant if the specified confidence interval does not
    include 0. The path 'X --> Y' is the sum of both the indirect and direct
    effect. It is sometimes referred to as the total effect. For more
    details, please refer to Fiedler et al. 2011 or Hayes and Rockwood 2017.

    A linear regression is used if the mediator variable is continuous and a
    logistic regression if the mediator variable is dichotomous (binary).
    Note that this function also supports parallel multiple mediators: "in
    such models, mediators may be and often are correlated, but nothing in
    the model allows one mediator to causally influence another."
    (Hayes and Rockwood 2017)

    This function will only work well if the outcome variable is continuous.
    It does not support binary or ordinal outcome variables. For more
    advanced mediation models, please refer to the `lavaan` or `mediation` R
    packages, or the PROCESS macro for SPSS.

    The two-sided p-value of the indirect effect is computed using the
    bootstrap distribution, as in the mediation R package. However, the
    p-value should be interpreted with caution since it is a) not constructed
    conditioned on a true null hypothesis (see Hayes and Rockwood 2017) and
    b) varies depending on the number of bootstrap samples and the random
    seed.

    Note that rows with NaN are automatically removed.
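
    In the conventional notation of Baron and Kenny (1986), if :math:`a`
    denotes the slope of the X → M regression and :math:`b` the slope of M in
    the full Y ~ X + M model, the indirect effect corresponds to the product
    :math:`ab`, and for purely linear models (with the same covariates in
    every regression) the total effect decomposes exactly as
    :math:`c = c' + ab`, where :math:`c'` is the direct effect (in the first
    example below, 0.0396 + 0.3565 = 0.3961, up to rounding). This identity
    holds only approximately when the mediator is binary.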

    Results have been tested against the R mediation package and this
    tutorial: https://data.library.virginia.edu/introduction-to-mediation-analysis/

    References
    ----------
    .. [1] Baron, R. M. & Kenny, D. A. The moderator–mediator variable
           distinction in social psychological research: Conceptual,
           strategic, and statistical considerations. J. Pers. Soc. Psychol.
           51, 1173–1182 (1986).

    .. [2] Fiedler, K., Schott, M. & Meiser, T. What mediation analysis can
           (not) do. J. Exp. Soc. Psychol. 47, 1231–1236 (2011).

    .. [3] Hayes, A. F. & Rockwood, N. J. Regression-based statistical
           mediation and moderation analysis in clinical research:
           Observations, recommendations, and implementation. Behav. Res.
           Ther. 98, 39–57 (2017).

    .. [4] https://cran.r-project.org/web/packages/mediation/mediation.pdf

    .. [5] http://lavaan.ugent.be/tutorial/mediation.html

    .. [6] https://github.com/rmill040/pymediation

    Examples
    --------
    1. Simple mediation analysis

    >>> from pingouin import mediation_analysis, read_dataset
    >>> df = read_dataset('mediation')
    >>> mediation_analysis(data=df, x='X', m='M', y='Y', alpha=0.05, seed=42)
           path    coef      se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.5610  0.0945  4.391362e-08    0.3735     0.7485  Yes
    1     Y ~ M  0.6542  0.0858  1.612674e-11    0.4838     0.8245  Yes
    2     Total  0.3961  0.1112  5.671128e-04    0.1755     0.6167  Yes
    3    Direct  0.0396  0.1096  7.187429e-01   -0.1780     0.2572   No
    4  Indirect  0.3565  0.0833  0.000000e+00    0.2198     0.5377  Yes

    2. Return the indirect bootstrapped beta coefficients

    >>> stats, dist = mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                                   return_dist=True)
    >>> print(dist.shape)
    (500,)

    3. Mediation analysis with a binary mediator variable

    >>> mediation_analysis(data=df, x='X', m='Mbin', y='Y', seed=42)
           path     coef      se      pval  CI[2.5%]  CI[97.5%]  sig
    0  Mbin ~ X  -0.0205  0.1159  0.859392   -0.2476     0.2066   No
    1  Y ~ Mbin  -0.1354  0.4118  0.743076   -0.9525     0.6818   No
    2     Total   0.3961  0.1112  0.000567    0.1755     0.6167  Yes
    3    Direct   0.3956  0.1117  0.000614    0.1739     0.6173  Yes
    4  Indirect   0.0023  0.0495  0.960000   -0.0715     0.1441   No

    4. Mediation analysis with covariates

    >>> mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                    covar=['Mbin', 'Ybin'], seed=42)
           path    coef      se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.5594  0.0968  9.394635e-08    0.3672     0.7516  Yes
    1     Y ~ M  0.6660  0.0861  1.017261e-11    0.4951     0.8368  Yes
    2     Total  0.4204  0.1129  3.324252e-04    0.1962     0.6446  Yes
    3    Direct  0.0645  0.1104  5.608583e-01   -0.1548     0.2837   No
    4  Indirect  0.3559  0.0865  0.000000e+00    0.2093     0.5530  Yes

    5. Mediation analysis with multiple parallel mediators

    >>> mediation_analysis(data=df, x='X', m=['M', 'Mbin'], y='Y', seed=42)
                path     coef      se          pval  CI[2.5%]  CI[97.5%]  sig
    0          M ~ X   0.5610  0.0945  4.391362e-08    0.3735     0.7485  Yes
    1       Mbin ~ X  -0.0051  0.0290  8.592408e-01   -0.0626     0.0523   No
    2          Y ~ M   0.6537  0.0863  2.118163e-11    0.4824     0.8250  Yes
    3       Y ~ Mbin  -0.0640  0.3282  8.456998e-01   -0.7154     0.5873   No
    4          Total   0.3961  0.1112  5.671128e-04    0.1755     0.6167  Yes
    5         Direct   0.0395  0.1102  7.206301e-01   -0.1792     0.2583   No
    6     Indirect M   0.3563  0.0845  0.000000e+00    0.2148     0.5385  Yes
    7  Indirect Mbin   0.0003  0.0097  9.520000e-01   -0.0172     0.0252   No
    """
    # Sanity check
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert isinstance(m, (list, str)), 'Mediator(s) must be a list or string.'
    assert isinstance(covar, (type(None), str, list))
    if isinstance(m, str):
        m = [m]
    n_mediator = len(m)
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame.'
    # Check for duplicates
    assert n_mediator == len(set(m)), 'Cannot have duplicate mediators.'
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(covar, list):
        assert len(covar) == len(set(covar)), 'Cannot have duplicate covar.'
        assert set(m).isdisjoint(covar), 'Mediator cannot be in covar.'

    # Check that columns are in dataframe
    columns = _fl([x, m, y, covar])
    keys = data.columns
    assert all([c in keys for c in columns]), 'Column(s) are not in DataFrame.'

    # Check that columns are numeric
    err_msg = "Columns must be numeric or boolean."
    assert all([data[c].dtype.kind in 'bfi' for c in columns]), err_msg

    # Drop rows with NaN values
    data = data[columns].dropna()
    n = data.shape[0]
    assert n > 5, 'DataFrame must have at least 5 samples (rows).'

    # Check if mediator is binary
    mtype = 'logistic' if all(data[m].nunique() == 2) else 'linear'

    # Name of CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Compute regressions
    cols = ['names', 'coef', 'se', 'pval', ll_name, ul_name]

    # For speed, we pass np.array instead of pandas DataFrame
    X_val = data[_fl([x, covar])].values  # X + covar as predictors
    XM_val = data[_fl([x, m, covar])].values  # X + M + covar as predictors
    M_val = data[m].values  # M as target (no covariates)
    y_val = data[y].values  # y as target (no covariates)

    # M(j) ~ X + covar
    sxm = {}
    for idx, j in enumerate(m):
        if mtype == 'linear':
            sxm[j] = linear_regression(X_val, M_val[:, idx],
                                       alpha=alpha).loc[[1], cols]
        else:
            sxm[j] = logistic_regression(X_val, M_val[:, idx],
                                         alpha=alpha).loc[[1], cols]
        sxm[j].loc[1, 'names'] = '%s ~ X' % j
    sxm = pd.concat(sxm, ignore_index=True)

    # Y ~ M + covar
    smy = linear_regression(data[_fl([m, covar])], y_val,
                            alpha=alpha).loc[1:n_mediator, cols]

    # Average Total Effects (Y ~ X + covar)
    sxy = linear_regression(X_val, y_val, alpha=alpha).loc[[1], cols]

    # Average Direct Effects (Y ~ X + M + covar)
    direct = linear_regression(XM_val, y_val, alpha=alpha).loc[[1], cols]

    # Rename paths
    smy['names'] = smy['names'].apply(lambda x: 'Y ~ %s' % x)
    direct.loc[1, 'names'] = 'Direct'
    sxy.loc[1, 'names'] = 'Total'

    # Concatenate and create sig column
    stats = pd.concat((sxm, smy, sxy, direct), ignore_index=True)
    stats['sig'] = np.where(stats['pval'] < alpha, 'Yes', 'No')

    # Bootstrap confidence intervals
    rng = np.random.RandomState(seed)
    idx = rng.choice(np.arange(n), replace=True, size=(n_boot, n))
    ab_estimates = np.zeros(shape=(n_boot, n_mediator))
    for i in range(n_boot):
        ab_estimates[i, :] = _point_estimate(X_val, XM_val, M_val, y_val,
                                             idx[i, :], n_mediator, mtype)

    ab = _point_estimate(X_val, XM_val, M_val, y_val, np.arange(n),
                         n_mediator, mtype)
    indirect = {'names': m, 'coef': ab,
                'se': ab_estimates.std(ddof=1, axis=0),
                'pval': [], ll_name: [], ul_name: [], 'sig': []}

    for j in range(n_mediator):
        ci_j = _bca(ab_estimates[:, j], indirect['coef'][j],
                    alpha=alpha, n_boot=n_boot)
        indirect[ll_name].append(min(ci_j))
        indirect[ul_name].append(max(ci_j))
        # Bootstrapped p-value of the indirect effect.
        # Note that this is less accurate than a permutation test because the
        # bootstrap distribution is not conditioned on a true null hypothesis.
        # For more details, see Hayes and Rockwood 2017.
        indirect['pval'].append(_pval_from_bootci(ab_estimates[:, j],
                                                  indirect['coef'][j]))
        indirect['sig'].append(
            'Yes' if indirect['pval'][j] < alpha else 'No')

    # Create output dataframe
    indirect = pd.DataFrame.from_dict(indirect)
    if n_mediator == 1:
        indirect['names'] = 'Indirect'
    else:
        indirect['names'] = indirect['names'].apply(
            lambda x: 'Indirect %s' % x)
    # Append the indirect rows to the summary table
    stats = pd.concat([stats, indirect], ignore_index=True)
    stats = stats.rename(columns={'names': 'path'})

    # Round
    col_to_round = ['coef', 'se', ll_name, ul_name]
    stats[col_to_round] = stats[col_to_round].round(4)

    if return_dist:
        return stats, np.squeeze(ab_estimates)
    else:
        return stats
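

# Illustrative sketch (not part of the library API): plotting the
# bootstrapped indirect-effect (ACME) distribution returned when
# return_dist=True. This assumes pingouin's bundled 'mediation' example
# dataset and that seaborn and matplotlib are installed.
#
#     >>> import matplotlib.pyplot as plt
#     >>> import seaborn as sns
#     >>> from pingouin import mediation_analysis, read_dataset
#     >>> df = read_dataset('mediation')
#     >>> stats, dist = mediation_analysis(data=df, x='X', m='M', y='Y',
#     ...                                   seed=42, return_dist=True)
#     >>> ax = sns.kdeplot(dist)                 # bootstrapped indirect effect
#     >>> ax.axvline(0, ls='--', color='k')      # 0 = no indirect effect
#     >>> plt.show()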