def test__postprocess_dataframe(self):
    """Test function _postprocess_dataframe.

    Sets a global rounding option plus column-, row- and cell-specific
    overrides, runs ``_postprocess_dataframe`` and compares the result with a
    hand-written expected frame. The package options are restored afterwards,
    whether or not the comparison succeeds.
    """
    # NOTE(review): `df` is not defined in this block; presumably a
    # module-level fixture of the test module -- confirm.
    df2 = df.copy()
    # add some more values and give a stringy index
    df2.Values = [1.54321, 5.87654, 8.23456, 3.45678]
    df2 = df2.assign(Values2=[1.54321, 5.87654, 8.23456, 3.45678])
    df2.index = ['row' + str(x) for x in df.index]

    # set rounding options (keeping original options dict to restore after)
    old_opts = pingouin.options.copy()
    try:
        pingouin.options.clear()
        pingouin.options['round'] = 4
        pingouin.options['round.cell.[row0]x[Values]'] = None
        pingouin.options['round.column.Values'] = 3
        pingouin.options['round.row.row1'] = 2
        pingouin.options['round.cell.[row3]x[Values2]'] = 0

        df_expected = df2.copy()
        df_expected.Values = [1.54321, 5.877, 8.235, 3.457]
        df_expected.Values2 = [1.5432, 5.88, 8.2346, 3.0]

        df2 = _postprocess_dataframe(df2)
        pd.testing.assert_frame_equal(df2, df_expected)
    finally:
        # restore old options even when the assertion fails; clear first so
        # the test-specific 'round.*' keys do not leak into other tests
        pingouin.options.clear()
        pingouin.options.update(old_opts)
def multivariate_ttest(X, Y=None, paired=False):
    """Hotelling T-squared test (= multivariate T-test)

    Parameters
    ----------
    X : np.array
        First data matrix of shape (n_samples, n_features).
    Y : np.array or None
        Second data matrix of shape (n_samples, n_features). If ``Y`` is a 1D
        array of shape (n_features), a one-sample test is performed where the
        null hypothesis is defined in ``Y``. If ``Y`` is None, a one-sample
        is performed against np.zeros(n_features).
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent. If ``paired`` is True, ``X`` and ``Y`` must
        have exactly the same shape. Any truthy value (e.g. ``numpy.bool_``)
        is treated as True.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'T2'``: T-squared value
        * ``'F'``: F-value
        * ``'df1'``: first degree of freedom
        * ``'df2'``: second degree of freedom
        * ``'pval'``: p-value

    See Also
    --------
    multivariate_normality : Multivariate normality test.
    ttest : Univariate T-test.

    Notes
    -----
    The Hotelling 's T-squared test [1]_ is the multivariate counterpart of
    the T-test.

    Rows with missing values are automatically removed using the
    :py:func:`remove_na` function.

    Tested against the `Hotelling
    <https://cran.r-project.org/web/packages/Hotelling/Hotelling.pdf>`_ R
    package.

    References
    ----------
    .. [1] Hotelling, H. The Generalization of Student's Ratio. Ann. Math.
           Statist. 2 (1931), no. 3, 360--378.

    See also http://www.real-statistics.com/multivariate-statistics/

    Examples
    --------
    Two-sample independent Hotelling T-squared test

    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> dvs = ['Fever', 'Pressure', 'Aches']
    >>> X = data[data['Condition'] == 'Drug'][dvs]
    >>> Y = data[data['Condition'] == 'Placebo'][dvs]
    >>> pg.multivariate_ttest(X, Y)
                     T2         F  df1  df2      pval
    hotelling  4.228679  1.326644    3   32  0.282898

    Two-sample paired Hotelling T-squared test

    >>> pg.multivariate_ttest(X, Y, paired=True)
                     T2         F  df1  df2      pval
    hotelling  4.468456  1.314252    3   15  0.306542

    One-sample Hotelling T-squared test with a specified null hypothesis

    >>> null_hypothesis_means = [37.5, 70, 5]
    >>> pg.multivariate_ttest(X, Y=null_hypothesis_means)
                       T2          F  df1  df2          pval
    hotelling  253.230991  74.479703    3   15  3.081281e-09
    """
    from scipy.stats import f

    # Normalise the flag once so that the shape checks and the choice of test
    # below always agree (``paired is True`` misses e.g. numpy.bool_ or 1).
    paired = bool(paired)

    x = np.asarray(X)
    assert x.ndim == 2, 'x must be of shape (n_samples, n_features)'

    if Y is None:
        y = np.zeros(x.shape[1])
        # Remove rows with missing values in x
        x = x[~np.isnan(x).any(axis=1)]
    else:
        nx, kx = x.shape
        y = np.asarray(Y)
        assert y.ndim in [1, 2], 'Y must be 1D or 2D.'
        if y.ndim == 1:
            # One sample with specified null
            assert y.size == kx, (
                'A 1D Y must have one value per feature (= column) of X.')
        else:
            # Two-sample
            err = 'X and Y must have the same number of features (= columns).'
            assert y.shape[1] == kx, err
            if paired:
                err = 'X and Y must have the same number of rows if paired.'
                assert y.shape[0] == nx, err
        # Remove rows with missing values in both x and y
        x, y = remove_na(x, y, paired=paired, axis='rows')

    # Shape of arrays
    nx, k = x.shape
    ny = y.shape[0]
    assert nx >= 5, 'At least five samples are required.'

    if y.ndim == 1 or paired:
        n = nx
        if y.ndim == 1:
            # One sample test
            cov = np.cov(x, rowvar=False)
            diff = x.mean(0) - y
        else:
            # Paired two sample
            cov = np.cov(x - y, rowvar=False)
            diff = x.mean(0) - y.mean(0)
        inv_cov = np.linalg.pinv(cov)
        t2 = (diff @ inv_cov) @ diff * n
    else:
        # Independent two-sample test with a pooled covariance matrix
        n = nx + ny - 1
        x_cov = np.cov(x, rowvar=False)
        y_cov = np.cov(y, rowvar=False)
        pooled_cov = ((nx - 1) * x_cov + (ny - 1) * y_cov) / (n - 1)
        inv_cov = np.linalg.pinv((1 / nx + 1 / ny) * pooled_cov)
        diff = x.mean(0) - y.mean(0)
        t2 = (diff @ inv_cov) @ diff

    # F-value, degrees of freedom and p-value
    fval = t2 * (n - k) / (k * (n - 1))
    df1 = k
    df2 = n - k
    pval = f.sf(fval, df1, df2)

    # Create output dictionary
    stats = {'T2': t2, 'F': fval, 'df1': df1, 'df2': df2, 'pval': pval}
    stats = pd.DataFrame(stats, index=['hotelling'])
    return _postprocess_dataframe(stats)
def tost(x, y, bound=1, paired=False, correction=False):
    """Two One-Sided Test (TOST) for equivalence.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` should have the
        same units. If ``y`` is a single value (e.g. 0), a one-sample test is
        performed.
    bound : float
        Magnitude of region of similarity (a.k.a epsilon). Note that this
        should be expressed in the same unit as ``x`` and ``y``. Python and
        NumPy integer / floating scalars are accepted.
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent.
    correction : auto or boolean
        Specify whether or not to correct for unequal variances using Welch
        separate variances T-test. This only applies if ``paired`` is False.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'bound'``: bound (= epsilon, or equivalence margin)
        * ``'dof'``: degrees of freedom
        * ``'pval'``: TOST p-value

    See also
    --------
    ttest

    References
    ----------
    .. [1] Schuirmann, D.L. 1981. On hypothesis testing to determine if the
           mean of a normal distribution is contained in a known interval.
           Biometrics 37 617.

    .. [2] https://cran.r-project.org/web/packages/equivalence/equivalence.pdf

    Examples
    --------
    1. Independent two-sample TOST with a region of similarity of 1 (default)

    >>> import pingouin as pg
    >>> a = [4, 7, 8, 6, 3, 2]
    >>> b = [6, 8, 7, 10, 11, 9]
    >>> pg.tost(a, b)
          bound  dof      pval
    TOST      1   10  0.965097

    2. Paired TOST with a different region of similarity

    >>> pg.tost(a, b, bound=0.5, paired=True)
          bound  dof      pval
    TOST    0.5    5  0.954854

    3. One sample TOST

    >>> pg.tost(a, y=0, bound=4)
          bound  dof      pval
    TOST      4    5  0.825967
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # NumPy scalars (np.int64, np.float32, ...) are not all subclasses of the
    # builtin int / float, so accept them explicitly.
    assert isinstance(bound, (int, float, np.integer, np.floating)), \
        'bound must be int or float.'

    # T-tests: shift x by +/- bound and run the two one-sided tests
    df_a = ttest(x + bound, y, paired=paired, correction=correction,
                 alternative='greater')
    df_b = ttest(x - bound, y, paired=paired, correction=correction,
                 alternative='less')
    # The TOST p-value is the larger of the two one-sided p-values
    pval = max(df_a.at['T-test', 'p-val'], df_b.at['T-test', 'p-val'])

    # Create output dataframe
    stats = pd.DataFrame(
        {
            'bound': bound,
            'dof': df_a.at['T-test', 'dof'],
            'pval': pval
        },
        index=['TOST'])
    return _postprocess_dataframe(stats)
def corr(x, y, tail='two-sided', method='pearson', **kwargs):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'`` p-value.
        Note that the former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: p-value (halved when ``tail='one-sided'``)
        * ``'BF10'``: Bayes Factor of the alternative hypothesis (only for
          Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported correlation types.

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation
    rm_corr : Repeated measures correlation

    Notes
    -----
    The `Pearson correlation coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    measures the linear relationship between two datasets. Strictly speaking,
    Pearson's correlation requires that each dataset be normally distributed.
    Correlations of -1 or +1 imply a perfect negative and positive linear
    relationship, respectively, with 0 indicating the absence of association.

    .. math::
        r_{xy} = \\frac{\\sum_i(x_i - \\bar{x})(y_i - \\bar{y})}
        {\\sqrt{\\sum_i(x_i - \\bar{x})^2} \\sqrt{\\sum_i(y_i - \\bar{y})^2}}
        = \\frac{\\text{cov}(x, y)}{\\sigma_x \\sigma_y}

    where :math:`\\text{cov}` is the sample covariance and :math:`\\sigma`
    is the sample standard deviation.

    If ``method='pearson'``, The Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    The `Spearman correlation coefficient
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
    is a non-parametric measure of the monotonicity of the relationship
    between two datasets. Unlike the Pearson correlation, the Spearman
    correlation does not assume that both datasets are normally distributed.
    Correlations of -1 or +1 imply an exact negative and positive monotonic
    relationship, respectively. Mathematically, the Spearman correlation
    coefficient is defined as the Pearson correlation coefficient between the
    `rank variables <https://en.wikipedia.org/wiki/Ranking>`_.

    The `Kendall correlation coefficient
    <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
    is a measure of the correspondence between two rankings. Values also range
    from -1 (perfect disagreement) to 1 (perfect agreement), with 0 indicating
    the absence of association. Consistent with
    :py:func:`scipy.stats.kendalltau`, Pingouin returns the Tau-b coefficient,
    which adjusts for ties:

    .. math:: \\tau_B = \\frac{(P - Q)}{\\sqrt{(P + Q + T) (P + Q + U)}}

    where :math:`P` is the number of concordant pairs, :math:`Q` the number of
    discordant pairs, :math:`T` the number of ties in x, and :math:`U`
    the number of ties in y.

    The `biweight midcorrelation
    <https://en.wikipedia.org/wiki/Biweight_midcorrelation>`_ and
    percentage bend correlation [1]_ are both robust methods that
    protect against *univariate* outliers by down-weighting observations that
    deviate too much from the median.

    The Shepherd pi [2]_ correlation and skipped [3]_, [4]_ correlation are
    both robust methods that return the Spearman correlation coefficient after
    removing *bivariate* outliers. Briefly, the Shepherd pi uses a
    bootstrapping of the Mahalanobis distance to identify outliers, while the
    skipped correlation is based on the minimum covariance determinant
    (which requires scikit-learn). Note that these two methods are
    significantly slower than the previous ones.

    .. important:: Please note that rows with missing values (NaN) are
        automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> import pingouin as pg
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> pg.corr(x, y).round(3)
              n      r         CI95%     r2  adj_r2  p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> pg.corr(x, y).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121

    3. Spearman correlation (robust to outliers)

    >>> pg.corr(x, y, method="spearman").round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028   0.61

    4. Biweight midcorrelation (robust)

    >>> pg.corr(x, y, method="bicor").round(3)
            n      r         CI95%     r2  adj_r2  p-val  power
    bicor  30  0.393  [0.04, 0.66]  0.155   0.092  0.031  0.592

    5. Percentage bend correlation (robust)

    >>> pg.corr(x, y, method='percbend').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.034  0.581

    6. Shepherd's pi correlation (robust)

    >>> pg.corr(x, y, method='shepherd').round(3)
               n  outliers      r         CI95%     r2  adj_r2  p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    7. Skipped spearman correlation (robust)

    >>> pg.corr(x, y, method='skipped').round(3)
              n  outliers      r         CI95%     r2  adj_r2  p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    8. One-tailed Pearson correlation

    >>> pg.corr(x, y, tail="one-sided", method='pearson').round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051   0.22  0.467  0.194

    9. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> pg.corr(data['x'], data['y']).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121
    """
    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, 'x and y must be 1D array.'
    assert x.size == y.size, 'x and y must have the same length.'
    _msg = 'tail must be "two-sided" or "one-sided".'
    assert tail in ['two-sided', 'one-sided'], _msg

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y, **kwargs)
    elif method == 'kendall':
        r, pval = kendalltau(x, y, **kwargs)
    elif method == 'bicor':
        r, pval = bicor(x, y, **kwargs)
    elif method == 'percbend':
        r, pval = percbend(x, y, **kwargs)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y, **kwargs)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoid sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame({'n': nx, 'r': np.nan, 'CI95%': np.nan,
                             'r2': np.nan, 'adj_r2': np.nan, 'p-val': np.nan,
                             'BF10': np.nan, 'power': np.nan}, index=[method])

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r', decimals=6)
    # No trailing comma here: `pr` must be a scalar, not a 1-tuple
    pr = power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail)

    # Create dictionary
    stats = {'n': nx,
             'r': r,
             'r2': r2,
             'adj_r2': adj_r2,
             'CI95%': [ci],
             'p-val': pval if tail == 'two-sided' else .5 * pval,
             'power': pr
             }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = ['n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val',
                'BF10', 'power']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'r'``: Repeated measures correlation coefficient
        * ``'dof'``: Degrees of freedom
        * ``'pval'``: one or two tailed p-value
        * ``'CI95%'``: 95% parametric confidence intervals
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Raises
    ------
    ValueError
        If ``data[subject]`` has fewer than 3 unique values.

    See also
    --------
    plot_rm_corr

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From `Bakdash and Marusich (2017)
    <https://doi.org/10.3389/fpsyg.2017.00456>`_:

        *Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by − 1 to 1 and represents the strength of the linear
        association between two variables.*

    Results have been tested against the
    `rmcorr <https://github.com/cran/rmcorr>`_ R package.

    Please note that missing values are automatically removed from the
    dataframe (listwise deletion).

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                   r  dof      pval           CI95%     power
    rm_corr -0.50677   38  0.000847  [-0.71, -0.23]  0.929579

    Now plot using the :py:func:`pingouin.plot_rm_corr` function:

    .. plot::

        >>> import pingouin as pg
        >>> df = pg.read_dataset('rm_corr')
        >>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
    """
    from pingouin import ancova, power_corr

    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    # For max precision, make sure rounding is disabled
    old_options = options.copy()
    options['round'] = None
    try:
        aov = ancova(dv=y, covar=x, between=subject, data=data)
    finally:
        # restore options, even if the ANCOVA raised
        options.update(old_options)

    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    # The coefficient takes the sign of the common within-subject slope
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)

    # Convert to Dataframe
    stats = pd.DataFrame({"r": rm,
                          "dof": dof,
                          "pval": pval,
                          "CI95%": [ci],
                          "power": pwr}, index=["rm_corr"])
    return _postprocess_dataframe(stats)
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, tail='two-sided', method='pearson', **kwargs):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be a names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    tail : string
        Specify whether to return `'one-sided'` or `'two-sided'` p-value.
        The former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level correlation functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value (halved when ``tail='one-sided'``)

    See also
    --------
    corr, pairwise_corr, rm_corr

    Notes
    -----
    From [1]_:

        *With partial correlation, we find the correlation between x
        and y holding C constant for both x and y. Sometimes, however, we want
        to hold C constant for just x or just y. In that case, we compute a
        semi-partial correlation. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable.*

    Note that if you are not interested in calculating the p-values [2]_ but
    only the partial correlation matrix, a faster alternative is to use
    :py:func:`pingouin.pcorr` (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    References
    ----------
    .. [1] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    .. [2] https://online.stat.psu.edu/stat505/lesson/6/6.3

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.491  [0.14, 0.73]  0.009

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.568  [0.26, 0.77]  0.001

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list

    # ---- Argument validation ----
    tail_msg = 'tail must be "two-sided" or "one-sided".'
    assert tail in ['two-sided', 'one-sided'], tail_msg
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string.'
    assert isinstance(y, (str, tuple)), 'y must be a string.'
    for candidate in (covar, x_covar, y_covar):
        assert isinstance(candidate, (str, list, type(None)))

    # covar, x_covar and y_covar are mutually exclusive
    has_semi = x_covar is not None or y_covar is not None
    if covar is not None and has_semi:
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')

    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'

    # All columns involved, collected before the covariates become lists
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    covar = [covar] if isinstance(covar, str) else covar
    x_covar = [x_covar] if isinstance(x_covar, str) else x_covar
    y_covar = [y_covar] if isinstance(y_covar, str) else y_covar

    # Columns must exist ...
    present = [c in data for c in col]
    assert all(present), 'columns are not in dataframe.'
    # ... and be numeric
    kinds = [data[c].dtype.kind for c in col]
    assert all([kind in 'bfiu' for kind in kinds])

    # ---- Listwise deletion of missing values ----
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Z-score every column so that no intercept is needed in the regressions
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    def _residuals(target, predictors):
        """Residuals of the least-squares fit of ``target`` on ``predictors``."""
        design = np.atleast_2d(C[predictors].to_numpy())
        coefs = np.linalg.lstsq(design, C[target].to_numpy(), rcond=None)[0]
        return C[target].to_numpy() - design @ coefs

    if covar is not None:
        # Partial correlation: covariates are regressed out of both variables
        res_x = _residuals(x, covar)
        res_y = _residuals(y, covar)
    else:
        # Semi-partial correlation: start from the raw variables and
        # residualize only the side for which covariates were given
        res_x = data[x].to_numpy()
        res_y = data[y].to_numpy()
        if x_covar is not None:
            res_x = _residuals(x, x_covar)
        if y_covar is not None:
            res_y = _residuals(y, y_covar)

    # ---- Correlation between the (pseudo-)residuals ----
    # The p-values returned here are discarded because they do not account
    # for the number of covariates in the degrees of freedom.
    if method == 'pearson':
        r, _ = pearsonr(res_x, res_y)
    elif method == 'spearman':
        r, _ = spearmanr(res_x, res_y, **kwargs)
    elif method == 'kendall':
        r, _ = kendalltau(res_x, res_y, **kwargs)
    elif method == 'bicor':
        r, _ = bicor(res_x, res_y, **kwargs)
    elif method == 'percbend':
        r, _ = percbend(res_x, res_y, **kwargs)
    elif method == 'shepherd':
        r, _, outliers = shepherd(res_x, res_y, **kwargs)
    elif method == 'skipped':
        r, _, outliers = skipped(res_x, res_y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoid sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame({'n': n, 'r': np.nan, 'CI95%': np.nan,
                             'p-val': np.nan}, index=[method])

    # Only the two outlier-removing methods define ``outliers``
    robust = method in ['shepherd', 'skipped']
    n_outliers = sum(outliers) if robust else 0
    n_clean = n - n_outliers  # Sample size after outlier removal

    # Two-sided p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n_clean, k)
    ci = compute_esci(
        stat=r, nx=(n_clean - k), ny=(n_clean - k), eftype='r', decimals=6)
    if tail != 'two-sided':
        pval = .5 * pval

    # ---- Assemble the output ----
    stats = {'n': n, 'r': r, 'CI95%': [ci], 'p-val': pval}
    if robust:
        stats['outliers'] = n_outliers
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Keep only the columns that exist, in display order
    available = stats.keys().tolist()
    wanted = ['n', 'outliers', 'r', 'CI95%', 'p-val']
    col_order = [name for name in wanted if name in available]
    return _postprocess_dataframe(stats)[col_order]
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, alternative='two-sided', method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be a names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    alternative : string
        Defines the alternative hypothesis, or tail of the partial
        correlation. Must be one of "two-sided" (default), "greater" or
        "less". Both "greater" and "less" return a one-sided p-value.
        "greater" tests against the alternative hypothesis that the partial
        correlation is positive (greater than zero), "less" tests against the
        hypothesis that the partial correlation is negative.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'r'``: Partial correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value

    See also
    --------
    corr, pcorr, pairwise_corr, rm_corr

    Notes
    -----
    Partial correlation [1]_ measures the degree of association between ``x``
    and ``y``, after removing the effect of one or more controlling variables
    (``covar``, or :math:`Z`). Practically, this is achieved by calculating
    the correlation coefficient between the residuals of two linear
    regressions:

    .. math:: x \\sim Z, y \\sim Z

    Like the correlation coefficient, the partial correlation
    coefficient takes on a value in the range from –1 to 1, where 1 indicates
    a perfect positive association.

    The semipartial correlation is similar to the partial correlation,
    with the exception that the set of controlling variables is only
    removed for either ``x`` or ``y``, but not both.

    Pingouin uses the method described in [2]_ to calculate the (semi)partial
    correlation coefficients and associated p-values. This method is based on
    the inverse covariance matrix and is significantly faster than the
    traditional regression-based method. Results have been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    .. important:: Rows with missing values are automatically removed from
        data.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [0.18, 0.75]  0.005

    3. Same but one-sided test

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="greater", method='spearman').round(3)
               n      r        CI95%  p-val
    spearman  30  0.521  [0.24, 1.0]  0.003

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="less", method='spearman').round(3)
               n      r          CI95%  p-val
    spearman  30  0.521  [-1.0, 0.72]  0.997

    4. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'], method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.578  [0.27, 0.78]  0.001

    5. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    6. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list

    # ---- Argument validation ----
    alt_msg = ("Alternative must be one of 'two-sided' (default), "
               "'greater' or 'less'.")
    assert alternative in ['two-sided', 'greater', 'less'], alt_msg
    method_msg = ('only "pearson" and "spearman" are supported for '
                  'partial correlation.')
    assert method in ['pearson', 'spearman'], method_msg
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'

    # covar, x_covar and y_covar are mutually exclusive
    has_semi = x_covar is not None or y_covar is not None
    if covar is not None and has_semi:
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')

    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'

    # Columns must exist and be numeric
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    present = [c in data for c in col]
    assert all(present), 'columns are not in dataframe.'
    kinds = [data[c].dtype.kind for c in col]
    assert all([kind in 'bfiu' for kind in kinds])

    # ---- Listwise deletion of missing values ----
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # ---- Partial correlation matrix - similar to pingouin.pcorr() ----
    # For Spearman the data are converted to ranks first, similar to R cov()
    source = data.rank(na_option='keep') if method == "spearman" else data
    V = source.cov()
    Vi = np.linalg.pinv(V, hermitian=True)  # Inverse covariance matrix
    Vi_diag = Vi.diagonal()
    D = np.diag(np.sqrt(1 / Vi_diag))
    pcor = -1 * (D @ Vi @ D)  # Partial correlation matrix

    # x and y are the first two columns of ``data``
    if covar is not None:
        r = pcor[0, 1]
    else:
        # Semi-partial correlation matrix
        with np.errstate(divide='ignore'):
            by_std = pcor / np.sqrt(np.diag(V))[..., None]
            scale = np.sqrt(np.abs(Vi_diag - Vi ** 2 / Vi_diag[..., None])).T
            spcor = by_std / scale
        # [0, 1]: y_covar is removed from y; [1, 0]: x_covar is removed from x
        r = spcor[0, 1] if y_covar is not None else spcor[1, 0]

    if np.isnan(r):
        # Correlation failed. Return NaN. When would this happen?
        nan_row = {'n': n, 'r': np.nan, 'CI95%': np.nan, 'p-val': np.nan}
        return pd.DataFrame(nan_row, index=[method])

    # ---- p-value and confidence intervals ----
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n, k, alternative)
    ci = compute_esci(stat=r, nx=(n - k), ny=(n - k), eftype='r', decimals=6,
                      alternative=alternative)

    # ---- Assemble the output ----
    stats = pd.DataFrame(
        {'n': n, 'r': r, 'CI95%': [ci], 'p-val': pval}, index=[method])

    # Keep only the columns that exist, in display order
    available = stats.keys().tolist()
    wanted = ['n', 'r', 'CI95%', 'p-val']
    col_order = [name for name in wanted if name in available]
    return _postprocess_dataframe(stats)[col_order]
def normality(data, dv=None, group=None, method="shapiro", alpha=.05):
    """Univariate normality test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, series, list or 1D np.array
        Iterable. Can be either a single list, 1D numpy array,
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Normality test. `'shapiro'` (default) performs the Shapiro-Wilk test
        using :py:func:`scipy.stats.shapiro`, and `'normaltest'` performs the
        omnibus test of normality using :py:func:`scipy.stats.normaltest`.
        The latter is more appropriate for large samples.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W'``: Test statistic.
        * ``'pval'``: p-value.
        * ``'normal'``: True if ``data`` is normally distributed.

    Raises
    ------
    AssertionError
        If ``data`` has an unsupported type, ``method`` is not one of the
        supported tests, an array-like input is not 1D or has 3 samples or
        fewer, or ``dv`` / ``group`` are not columns of a long-format
        dataframe.

    See Also
    --------
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Shapiro-Wilk test calculates a :math:`W` statistic that tests whether a
    random sample :math:`x_1, x_2, ..., x_n` comes from a normal distribution.

    The :math:`W` statistic is calculated as follows:

    .. math::

        W = \\frac{(\\sum_{i=1}^n a_i x_{i})^2}
        {\\sum_{i=1}^n (x_i - \\overline{x})^2}

    where the :math:`x_i` are the ordered sample values (in ascending
    order) and the :math:`a_i` are constants generated from the means,
    variances and covariances of the order statistics of a sample of size
    :math:`n` from a standard normal distribution. Specifically:

    .. math:: (a_1, ..., a_n) = \\frac{m^TV^{-1}}{(m^TV^{-1}V^{-1}m)^{1/2}}

    with :math:`m = (m_1, ..., m_n)^T` and :math:`(m_1, ..., m_n)` are the
    expected values of the order statistics of independent and identically
    distributed random variables sampled from the standard normal distribution,
    and :math:`V` is the covariance matrix of those order statistics.

    The null-hypothesis of this test is that the population is normally
    distributed. Thus, if the p-value is less than the
    chosen alpha level (typically set at 0.05), then the null hypothesis is
    rejected and there is evidence that the data tested are not normally
    distributed.

    The result of the Shapiro-Wilk test should be interpreted with caution in
    the case of large sample sizes. Indeed, quoting from
    `Wikipedia <https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test>`_:

        *"Like most statistical significance tests, if the sample size is
        sufficiently large this test may detect even trivial departures from
        the null hypothesis (i.e., although there may be some statistically
        significant effect, it may be too small to be of any practical
        significance); thus, additional investigation of the effect size is
        typically advisable, e.g., a Q–Q plot in this case."*

    Note that missing values are automatically removed (casewise deletion).

    References
    ----------
    * Shapiro, S. S., & Wilk, M. B. (1965). An analysis of variance test
      for normality (complete samples). Biometrika, 52(3/4), 591-611.

    * https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm

    Examples
    --------
    1. Shapiro-Wilk test on a 1D array.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=100)
    >>> pg.normality(x)
             W      pval  normal
    0  0.98414  0.274886    True

    2. Omnibus test on a wide-format dataframe with missing values

    >>> data = pg.read_dataset('mediation')
    >>> data.loc[1, 'X'] = np.nan
    >>> pg.normality(data, method='normaltest').round(3)
                W   pval  normal
    X       1.792  0.408    True
    M       0.492  0.782    True
    Y       0.349  0.840    True
    Mbin  839.716  0.000   False
    Ybin  814.468  0.000   False
    W1     24.816  0.000   False
    W2     43.400  0.000   False

    3. Pandas Series

    >>> pg.normality(data['X'], method='normaltest')
              W      pval  normal
    X  1.791839  0.408232    True

    4. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.normality(data, dv='Performance', group='Time')
                 W      pval  normal
    Pre   0.967718  0.478773    True
    Post  0.940728  0.095157    True
    """
    assert isinstance(data, (pd.DataFrame, pd.Series, list, np.ndarray))
    assert method in ['shapiro', 'normaltest']
    if isinstance(data, pd.Series):
        # A Series is handled as a one-column wide-format dataframe
        data = data.to_frame()
    col_names = ['W', 'pval']
    func = getattr(scipy.stats, method)
    if isinstance(data, (list, np.ndarray)):
        data = np.asarray(data)
        assert data.ndim == 1, 'Data must be 1D.'
        assert data.size > 3, 'Data must have more than 3 samples.'
        data = remove_na(data)
        stats = pd.DataFrame(func(data)).T
        stats.columns = col_names
        stats['normal'] = stats['pval'].to_numpy() > alpha
    else:
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            stats = numdata.apply(lambda x: func(x.dropna()),
                                  result_type='expand', axis=0).T
            stats.columns = col_names
            stats['normal'] = stats['pval'].to_numpy() > alpha
        else:
            # Long-format
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group, observed=True, sort=False)
            # Run the test on each group (recursive call on a 1D array) and
            # collect the one-row results. The frames are concatenated once
            # at the end: DataFrame.append was removed in pandas 2.0.
            labels, frames = [], []
            for label, tmp in grp:
                labels.append(label)
                frames.append(normality(tmp[dv].to_numpy(), method=method,
                                        alpha=alpha))
            # pd.concat refuses an empty list, so keep an empty frame when
            # there is no group at all.
            stats = pd.concat(frames) if frames else pd.DataFrame([])
            # Label each row with the group it was computed from
            stats.index = labels
    return _postprocess_dataframe(stats)
def box_m(data, dvs, group, alpha=.001):
    """Test equality of covariance matrices using the Box's M test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format dataframe.
    dvs : list
        Dependent variables.
    group : str
        Grouping variable.
    alpha : float
        Significance level. Default is 0.001 as recommended in [2]_. A
        non-significant p-value (higher than alpha) indicates that the
        covariance matrices are homogenous (= equal).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Chi2'``: Test statistic
        * ``'pval'``: p-value
        * ``'df'``: The Chi-Square statistic's degree of freedom
        * ``'equal_cov'``: True if ``data`` has equal covariance

    Raises
    ------
    AssertionError
        If ``data`` is not a dataframe, if ``group`` or any of ``dvs`` is not
        a column of ``data``, or if there are fewer than two groups.

    Notes
    -----
    .. warning:: Box's M test is susceptible to errors if the data does not
        meet the assumption of multivariate normality or if the sample size is
        too large or small [3]_.

    Pingouin uses :py:meth:`pandas.DataFrameGroupBy.cov` to calculate the
    variance-covariance matrix of each group. Missing values are automatically
    excluded from the calculation by Pandas.

    Mathematical expressions can be found in [1]_.

    The M statistic is evaluated on the log scale (from the log-determinants
    of the covariance matrices) so that it does not underflow to zero when the
    groups are large.

    This function has been tested against the boxM package of the `biotools`
    R package [4]_.

    References
    ----------
    .. [1] Rencher, A. C. (2003). Methods of multivariate analysis (Vol. 492).
           John Wiley & Sons.

    .. [2] Hahs-Vaughn, D. (2016). Applied Multivariate Statistical Concepts.
           Taylor & Francis.

    .. [3] https://en.wikipedia.org/wiki/Box%27s_M_test

    .. [4] https://cran.r-project.org/web/packages/biotools/index.html

    Examples
    --------
    1. Box M test with 3 dependent variables of 4 groups (equal sample size)

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> from scipy.stats import multivariate_normal as mvn
    >>> data = pd.DataFrame(mvn.rvs(size=(100, 3), random_state=42),
    ...                     columns=['A', 'B', 'C'])
    >>> data['group'] = [1] * 25 + [2] * 25 + [3] * 25 + [4] * 25
    >>> data.head()
              A         B         C  group
    0  0.496714 -0.138264  0.647689      1
    1  1.523030 -0.234153 -0.234137      1
    2  1.579213  0.767435 -0.469474      1
    3  0.542560 -0.463418 -0.465730      1
    4  0.241962 -1.913280 -1.724918      1

    >>> pg.box_m(data, dvs=['A', 'B', 'C'], group='group')
              Chi2    df      pval  equal_cov
    box  11.634185  18.0  0.865537       True

    2. Box M test with 3 dependent variables of 2 groups (unequal sample size)

    >>> data = pd.DataFrame(mvn.rvs(size=(30, 2), random_state=42),
    ...                     columns=['A', 'B'])
    >>> data['group'] = [1] * 20 + [2] * 10
    >>> pg.box_m(data, dvs=['A', 'B'], group='group')
             Chi2   df      pval  equal_cov
    box  0.706709  3.0  0.871625       True
    """
    # Safety checks
    from scipy.stats import chi2
    assert isinstance(data, pd.DataFrame), "data must be a pandas dataframe."
    assert group in data.columns, "The grouping variable is not in data."
    assert set(dvs).issubset(data.columns), "The DVs are not in data."
    grp = data.groupby(group, observed=True)[dvs]
    assert grp.ngroups > 1, 'Data must have at least two groups.'

    # Calculate covariance matrix and descriptive statistics
    # - n_covs is the number of covariance matrices
    # - n_dvs is the number of variables
    # - n_samp is the number of samples in each covariance matrix
    # - nobs is the total number of observations
    covs = grp.cov()
    n_covs, n_dvs = covs.index.levshape
    n_samp = grp.count().iloc[:, 0].to_numpy()  # NaN are excluded by .count
    nobs = n_samp.sum()
    v = n_samp - 1

    # Calculate pooled covariance matrix (S)
    covs = covs.to_numpy().reshape(n_covs, n_dvs, -1)
    S = (covs * v[..., None, None]).sum(axis=0) / (nobs - n_covs)

    # Calculate log(M), with M = prod((|S_i| / |S|) ** (v_i / 2)).
    # Working with log-determinants avoids forming M itself, which underflows
    # to 0 (and log(M) to -inf) as soon as the groups are large.
    sign_S, logdet_S = np.linalg.slogdet(S)
    sign_covs, logdet_covs = np.linalg.slogdet(covs)
    if sign_S < 0 or (sign_covs < 0).any():
        # A negative determinant is not a valid covariance matrix
        log_M = np.nan
    else:
        log_M = (v / 2 * (logdet_covs - logdet_S)).sum()

    # Calculate C in reference [1] (page 257-259)
    if len(np.unique(n_samp)) == 1:
        # All groups have same number of samples
        c = ((n_covs + 1) * (2 * n_dvs ** 2 + 3 * n_dvs - 1)) \
            / (6 * n_covs * (n_dvs + 1) * (nobs / n_covs - 1))
    else:
        # Unequal sample size
        c = (2 * n_dvs**2 + 3 * n_dvs - 1) / (6 * (n_dvs + 1) * (n_covs - 1))
        c *= ((1 / v).sum() - 1 / v.sum())

    # Calculate U statistics and degree of freedom
    u = -2 * (1 - c) * log_M
    df = 0.5 * n_dvs * (n_dvs + 1) * (n_covs - 1)
    p = chi2.sf(u, df)
    equal_cov = bool(p > alpha)
    stats = pd.DataFrame(index=["box"], data={
        'Chi2': [u],
        'df': [df],
        'pval': [p],
        'equal_cov': [equal_cov]
    })
    return _postprocess_dataframe(stats)
def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05):
    """Test equality of variance.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, list or dict
        Iterable. Can be either a list / dictionary of iterables
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Statistical test. `'levene'` (default) performs the Levene test
        using :py:func:`scipy.stats.levene`, and `'bartlett'` performs the
        Bartlett test using :py:func:`scipy.stats.bartlett`.
        The former is more robust to departure from normality.
        The name is case-insensitive.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W/T'``: Test statistic ('W' for Levene, 'T' for Bartlett)
        * ``'pval'``: p-value
        * ``'equal_var'``: True if ``data`` has equal variance

    Raises
    ------
    AssertionError
        If ``data`` has an unsupported type, ``method`` is not a supported
        test, fewer than two columns / groups / iterables are provided, or
        ``dv`` / ``group`` are not columns of a long-format dataframe.

    See Also
    --------
    normality : Univariate normality test.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The **Bartlett** :math:`T` statistic [1]_ is defined as:

    .. math::

        T = \\frac{(N-k) \\ln{s^{2}_{p}} - \\sum_{i=1}^{k}(N_{i} - 1)
        \\ln{s^{2}_{i}}}{1 + (1/(3(k-1)))((\\sum_{i=1}^{k}{1/(N_{i} - 1))}
        - 1/(N-k))}

    where :math:`s_i^2` is the variance of the :math:`i^{th}` group,
    :math:`N` is the total sample size, :math:`N_i` is the sample size of the
    :math:`i^{th}` group, :math:`k` is the number of groups,
    and :math:`s_p^2` is the pooled variance.

    The pooled variance is a weighted average of the group variances and is
    defined as:

    .. math:: s^{2}_{p} = \\sum_{i=1}^{k}(N_{i} - 1)s^{2}_{i}/(N-k)

    The p-value is then computed using a chi-square distribution:

    .. math:: T \\sim \\chi^2(k-1)

    The **Levene** :math:`W` statistic [2]_ is defined as:

    .. math::

        W = \\frac{(N-k)} {(k-1)}
        \\frac{\\sum_{i=1}^{k}N_{i}(\\overline{Z}_{i.}-\\overline{Z})^{2} }
        {\\sum_{i=1}^{k}\\sum_{j=1}^{N_i}(Z_{ij}-\\overline{Z}_{i.})^{2} }

    where :math:`Z_{ij} = |Y_{ij} - \\text{median}({Y}_{i.})|`,
    :math:`\\overline{Z}_{i.}` are the group means of :math:`Z_{ij}` and
    :math:`\\overline{Z}` is the grand mean of :math:`Z_{ij}`.

    The p-value is then computed using a F-distribution:

    .. math:: W \\sim F(k-1, N-k)

    For a wide-format dataframe, each numeric column is one sample: the
    variances are compared across columns.

    .. warning:: Missing values are not supported for this function. Make sure
        to remove them before using the
        :py:meth:`pandas.DataFrame.dropna` or :py:func:`pingouin.remove_na`
        functions.

    References
    ----------
    .. [1] Bartlett, M. S. (1937). Properties of sufficiency and statistical
           tests. Proc. R. Soc. Lond. A, 160(901), 268-282.

    .. [2] Brown, M. B., & Forsythe, A. B. (1974). Robust tests for the
           equality of variances. Journal of the American Statistical
           Association, 69(346), 364-367.

    Examples
    --------
    1. Levene test on a wide-format dataframe

    >>> import numpy as np
    >>> import pingouin as pg
    >>> data = pg.read_dataset('mediation')
    >>> pg.homoscedasticity(data[['X', 'Y', 'M']])
                   W      pval  equal_var
    levene  1.173518  0.310707       True

    2. Bartlett test using a list of iterables

    >>> data = [[4, 8, 9, 20, 14], np.array([5, 8, 15, 45, 12])]
    >>> pg.homoscedasticity(data, method="bartlett", alpha=.05)
                     T      pval  equal_var
    bartlett  2.873569  0.090045       True

    3. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.homoscedasticity(data, dv='Performance', group='Time')
                   W      pval  equal_var
    levene  3.192197  0.079217       True
    """
    assert isinstance(data, (pd.DataFrame, list, dict))
    assert method.lower() in ['levene', 'bartlett']
    # The check above is case-insensitive, so normalize the name before
    # looking up the SciPy function (scipy.stats has no 'Levene' attribute).
    method = method.lower()
    func = getattr(scipy.stats, method)
    if isinstance(data, pd.DataFrame):
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            assert numdata.shape[1] > 1, 'Data must have at least two columns.'
            # Iterating over a 2D array yields its rows: transpose so that
            # each *column* is passed to the test as one sample.
            statistic, p = func(*numdata.to_numpy().T)
        else:
            # Long-format
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group, observed=True)[dv]
            assert grp.ngroups > 1, 'Data must have at least two groups.'
            statistic, p = func(*grp.apply(list))
    elif isinstance(data, list):
        # Check that list contains other list or np.ndarray
        assert all(isinstance(el, (list, np.ndarray)) for el in data)
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data)
    else:
        # Data is a dict
        assert all(isinstance(el, (list, np.ndarray)) for el in data.values())
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data.values())

    equal_var = bool(p > alpha)
    stat_name = 'W' if method == 'levene' else 'T'

    stats = pd.DataFrame(
        {
            stat_name: statistic,
            'pval': p,
            'equal_var': equal_var
        },
        index=[method])

    return _postprocess_dataframe(stats)
def mediation_analysis(data=None, x=None, m=None, y=None, covar=None,
                       alpha=0.05, n_boot=500, seed=None, return_dist=False):
    """Mediation analysis using a bias-corrected non-parametric bootstrap
    method.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x : str
        Column name in data containing the predictor variable.
        The predictor variable must be continuous.
    m : str or list of str
        Column name(s) in data containing the mediator variable(s).
        The mediator(s) can be continuous or binary (e.g. 0 or 1).
        This function supports multiple parallel mediators.
    y : str
        Column name in data containing the outcome variable.
        The outcome variable must be continuous.
    covar : None, str, or list
        Covariate(s). If not None, the specified covariate(s) will be included
        in all regressions.
    alpha : float
        Significance threshold. Used to determine the confidence interval,
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`.
    n_boot : int
        Number of bootstrap iterations for confidence intervals and p-values
        estimation. The greater, the slower.
    seed : int or None
        Random state seed.
    return_dist : bool
        If True, the function also returns the indirect bootstrapped beta
        samples (size = n_boot). Can be plotted for instance using
        :py:func:`seaborn.distplot()` or :py:func:`seaborn.kdeplot()`
        functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Mediation summary:

        * ``'path'``: regression model
        * ``'coef'``: regression estimates
        * ``'se'``: standard error
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval
        * ``'pval'``: two-sided p-values
        * ``'sig'``: statistical significance

    dist : :py:class:`numpy.ndarray`
        Bootstrapped indirect effect(s). Only returned if ``return_dist`` is
        True.

    Raises
    ------
    AssertionError
        If an argument has an invalid type, if mediators or covariates are
        duplicated or overlap, if a column is missing from ``data`` or is not
        numeric / boolean, or if fewer than 6 complete rows remain after
        removing missing values.

    See also
    --------
    linear_regression, logistic_regression

    Notes
    -----
    Mediation analysis [1]_ is a *"statistical procedure to test
    whether the effect of an independent variable X on a dependent variable
    Y (i.e., X → Y) is at least partly explained by a chain of effects of the
    independent variable on an intervening mediator variable M and of the
    intervening variable on the dependent variable (i.e., X → M → Y)"* [2]_.

    The **indirect effect** (also referred to as average causal mediation
    effect or ACME) of X on Y through mediator M quantifies the estimated
    difference in Y resulting from a one-unit change in X through a sequence of
    causal steps in which X affects M, which in turn affects Y.
    It is considered significant if the specified confidence interval does not
    include 0. The path 'X --> Y' is the sum of both the indirect and direct
    effect. It is sometimes referred to as total effect.

    A linear regression is used if the mediator variable is continuous and a
    logistic regression if the mediator variable is dichotomous (binary).
    Multiple parallel mediators are also supported.

    This function will only work well if the outcome variable is continuous.
    It does not support binary or ordinal outcome variable. For more
    advanced mediation models, please refer to the
    `lavaan <http://lavaan.ugent.be/tutorial/mediation.html>`_ or `mediation
    <https://cran.r-project.org/web/packages/mediation/mediation.pdf>`_ R
    packages, or the `PROCESS macro
    <https://www.processmacro.org/index.html>`_ for SPSS.

    The two-sided p-value of the indirect effect is computed using the
    bootstrap distribution, as in the mediation R package. However, the p-value
    should be interpreted with caution since it is not constructed
    conditioned on a true null hypothesis [3]_ and varies depending on the
    number of bootstrap samples and the random seed.

    Note that rows with missing values are automatically removed.

    Rounding is temporarily disabled in the global Pingouin options while the
    regressions and the bootstrap are running; the previous options are
    restored before returning, even if an error is raised.

    Results have been tested against the R mediation package and this tutorial
    https://data.library.virginia.edu/introduction-to-mediation-analysis/

    References
    ----------
    .. [1] Baron, R. M. & Kenny, D. A. The moderator–mediator variable
           distinction in social psychological research: Conceptual, strategic,
           and statistical considerations. J. Pers. Soc. Psychol. 51, 1173–1182
           (1986).

    .. [2] Fiedler, K., Schott, M. & Meiser, T. What mediation analysis can
           (not) do. J. Exp. Soc. Psychol. 47, 1231–1236 (2011).

    .. [3] Hayes, A. F. & Rockwood, N. J. Regression-based statistical
           mediation and moderation analysis in clinical research:
           Observations, recommendations, and implementation. Behav. Res.
           Ther. 98, 39–57 (2017).

    Code originally adapted from https://github.com/rmill040/pymediation.

    Examples
    --------
    1. Simple mediation analysis

    >>> from pingouin import mediation_analysis, read_dataset
    >>> df = read_dataset('mediation')
    >>> mediation_analysis(data=df, x='X', m='M', y='Y', alpha=0.05,
    ...                    seed=42)
           path      coef        se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.561015  0.094480  4.391362e-08  0.373522   0.748509  Yes
    1     Y ~ M  0.654173  0.085831  1.612674e-11  0.483844   0.824501  Yes
    2     Total  0.396126  0.111160  5.671128e-04  0.175533   0.616719  Yes
    3    Direct  0.039604  0.109648  7.187429e-01 -0.178018   0.257226   No
    4  Indirect  0.356522  0.083313  0.000000e+00  0.219818   0.537654  Yes

    2. Return the indirect bootstrapped beta coefficients

    >>> stats, dist = mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                                  return_dist=True)
    >>> print(dist.shape)
    (500,)

    3. Mediation analysis with a binary mediator variable

    >>> mediation_analysis(data=df, x='X', m='Mbin', y='Y', seed=42).round(3)
           path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0  Mbin ~ X -0.021  0.116  0.857    -0.248      0.206   No
    1  Y ~ Mbin -0.135  0.412  0.743    -0.952      0.682   No
    2     Total  0.396  0.111  0.001     0.176      0.617  Yes
    3    Direct  0.396  0.112  0.001     0.174      0.617  Yes
    4  Indirect  0.002  0.050  0.960    -0.072      0.146   No

    4. Mediation analysis with covariates

    >>> mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                    covar=['Mbin', 'Ybin'], seed=42).round(3)
           path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.559  0.097  0.000     0.367      0.752  Yes
    1     Y ~ M  0.666  0.086  0.000     0.495      0.837  Yes
    2     Total  0.420  0.113  0.000     0.196      0.645  Yes
    3    Direct  0.064  0.110  0.561    -0.155      0.284   No
    4  Indirect  0.356  0.086  0.000     0.209      0.553  Yes

    5. Mediation analysis with multiple parallel mediators

    >>> mediation_analysis(data=df, x='X', m=['M', 'Mbin'], y='Y',
    ...                    seed=42).round(3)
                path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0          M ~ X  0.561  0.094  0.000     0.374      0.749  Yes
    1       Mbin ~ X -0.005  0.029  0.859    -0.063      0.052   No
    2          Y ~ M  0.654  0.086  0.000     0.482      0.825  Yes
    3       Y ~ Mbin -0.064  0.328  0.846    -0.715      0.587   No
    4          Total  0.396  0.111  0.001     0.176      0.617  Yes
    5         Direct  0.040  0.110  0.721    -0.179      0.258   No
    6     Indirect M  0.356  0.085  0.000     0.215      0.538  Yes
    7  Indirect Mbin  0.000  0.010  0.952    -0.017      0.025   No
    """
    # Sanity check
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert isinstance(m, (list, str)), 'Mediator(s) must be a list or string.'
    assert isinstance(covar, (type(None), str, list))
    if isinstance(m, str):
        m = [m]
    n_mediator = len(m)
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame.'
    # Check for duplicates
    assert n_mediator == len(set(m)), 'Cannot have duplicates mediators.'
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(covar, list):
        assert len(covar) == len(set(covar)), 'Cannot have duplicates covar.'
        assert set(m).isdisjoint(covar), 'Mediator cannot be in covar.'
    # Check that columns are in dataframe
    columns = _fl([x, m, y, covar])
    keys = data.columns
    assert all([c in keys for c in columns]), 'Column(s) are not in DataFrame.'
    # Check that columns are numeric
    err_msg = "Columns must be numeric or boolean."
    assert all([data[c].dtype.kind in 'bfiu' for c in columns]), err_msg

    # Drop rows with NAN Values
    data = data[columns].dropna()
    n = data.shape[0]
    assert n > 5, 'DataFrame must have more than 5 samples (rows).'

    # Check if mediator is binary
    mtype = 'logistic' if all(data[m].nunique() == 2) else 'linear'

    # Name of CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Compute regressions
    cols = ['names', 'coef', 'se', 'pval', ll_name, ul_name]

    # For speed, we pass np.array instead of pandas DataFrame
    X_val = data[_fl([x, covar])].to_numpy()  # X + covar as predictors
    XM_val = data[_fl([x, m, covar])].to_numpy()  # X + M + covar as predictors
    M_val = data[m].to_numpy()  # M as target (no covariates)
    y_val = data[y].to_numpy()  # y as target (no covariates)

    # For max precision, make sure rounding is disabled. The options are a
    # global state, so they are restored in a ``finally`` clause: a failing
    # regression must not leave rounding disabled for the rest of the session.
    old_options = options.copy()
    options['round'] = None
    try:
        # M(j) ~ X + covar
        sxm = {}
        for idx, j in enumerate(m):
            if mtype == 'linear':
                sxm[j] = linear_regression(X_val, M_val[:, idx],
                                           alpha=alpha).loc[[1], cols]
            else:
                sxm[j] = logistic_regression(X_val, M_val[:, idx],
                                             alpha=alpha).loc[[1], cols]
            sxm[j].at[1, 'names'] = '%s ~ X' % j
        sxm = pd.concat(sxm, ignore_index=True)

        # Y ~ M + covar
        smy = linear_regression(data[_fl([m, covar])], y_val,
                                alpha=alpha).loc[1:n_mediator, cols]

        # Average Total Effects (Y ~ X + covar)
        sxy = linear_regression(X_val, y_val, alpha=alpha).loc[[1], cols]

        # Average Direct Effects (Y ~ X + M + covar)
        direct = linear_regression(XM_val, y_val, alpha=alpha).loc[[1], cols]

        # Rename paths
        smy['names'] = smy['names'].apply(lambda x: 'Y ~ %s' % x)
        direct.at[1, 'names'] = 'Direct'
        sxy.at[1, 'names'] = 'Total'

        # Concatenate and create sig column
        stats = pd.concat((sxm, smy, sxy, direct), ignore_index=True)
        stats['sig'] = np.where(stats['pval'] < alpha, 'Yes', 'No')

        # Bootstrap confidence intervals
        rng = np.random.RandomState(seed)
        # One row of resampled (with replacement) row indices per iteration
        boot_idx = rng.choice(np.arange(n), replace=True, size=(n_boot, n))
        ab_estimates = np.zeros(shape=(n_boot, n_mediator))
        for i in range(n_boot):
            ab_estimates[i, :] = _point_estimate(X_val, XM_val, M_val, y_val,
                                                 boot_idx[i, :], n_mediator,
                                                 mtype)

        # Point estimate of the indirect effect(s) on the full sample
        ab = _point_estimate(X_val, XM_val, M_val, y_val, np.arange(n),
                             n_mediator, mtype)
        indirect = {'names': m, 'coef': ab,
                    'se': ab_estimates.std(ddof=1, axis=0),
                    'pval': [], ll_name: [], ul_name: [], 'sig': []}

        for j in range(n_mediator):
            ci_j = _bca(ab_estimates[:, j], indirect['coef'][j],
                        alpha=alpha, n_boot=n_boot)
            indirect[ll_name].append(min(ci_j))
            indirect[ul_name].append(max(ci_j))
            # Bootstrapped p-value of indirect effect
            # Note that this is less accurate than a permutation test because
            # the bootstrap distribution is not conditioned on a true null
            # hypothesis. For more details see Hayes and Rockwood 2017
            indirect['pval'].append(_pval_from_bootci(ab_estimates[:, j],
                                                      indirect['coef'][j]))
            indirect['sig'].append('Yes' if indirect['pval'][j] < alpha
                                   else 'No')

        # Create output dataframe
        indirect = pd.DataFrame.from_dict(indirect)
        if n_mediator == 1:
            indirect['names'] = 'Indirect'
        else:
            indirect['names'] = indirect['names'].apply(
                lambda x: 'Indirect %s' % x)
        # DataFrame.append was removed in pandas 2.0: use pd.concat instead
        stats = pd.concat([stats, indirect], ignore_index=True)
        stats = stats.rename(columns={'names': 'path'})
    finally:
        # Restore the options exactly as they were before the call
        options.clear()
        options.update(old_options)

    # The final rounding is applied with the restored (user) options
    if return_dist:
        return _postprocess_dataframe(stats), np.squeeze(ab_estimates)
    else:
        return _postprocess_dataframe(stats)
def logistic_regression(X, y, coef_only=False, alpha=0.05, as_dataframe=True, remove_na=False, **kwargs): """(Multiple) Binary logistic regression. Parameters ---------- X : array_like Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*. y : array_like Dependent variable, of shape *(n_samples)*. ``y`` must be binary, i.e. only contains 0 or 1. Multinomial logistic regression is not supported. coef_only : bool If True, return only the regression coefficients. alpha : float Alpha value used for the confidence intervals. :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]` as_dataframe : bool If True, returns a pandas DataFrame. If False, returns a dictionnary. remove_na : bool If True, apply a listwise deletion of missing values (i.e. the entire row is removed). Default is False, which will raise an error if missing values are present in either the predictor(s) or dependent variable. **kwargs : optional Optional arguments passed to :py:class:`sklearn.linear_model.LogisticRegression` (see Notes). Returns ------- stats : :py:class:`pandas.DataFrame` or dict Logistic regression summary: * ``'names'``: name of variable(s) in the model (e.g. x1, x2...) * ``'coef'``: regression coefficients (log-odds) * ``'se'``: standard error * ``'z'``: z-scores * ``'pval'``: two-tailed p-values * ``'CI[2.5%]'``: lower confidence interval * ``'CI[97.5%]'``: upper confidence interval See also -------- linear_regression Notes ----- .. caution:: This function is a wrapper around the :py:class:`sklearn.linear_model.LogisticRegression` class. However, Pingouin internally disables the L2 regularization and changes the default solver in order to get results that are similar to R and statsmodels. The logistic regression assumes that the log-odds (the logarithm of the odds) for the value labeled "1" in the response variable is a linear combination of the predictor variables. 
The log-odds are given by the `logit <https://en.wikipedia.org/wiki/Logit>`_ function, which map a probability :math:`p` of the response variable being "1" from :math:`[0, 1)` to :math:`(-\\infty, +\\infty)`. .. math:: \\text{logit}(p) = \\ln \\frac{p}{1 - p} = \\beta_0 + \\beta X The odds of the response variable being "1" can be obtained by exponentiating the log-odds: .. math:: \\frac{p}{1 - p} = e^{\\beta_0 + \\beta X} and the probability of the response variable being "1" is given by the `logistic function <https://en.wikipedia.org/wiki/Logistic_function>`_: .. math:: p = \\frac{1}{1 + e^{-(\\beta_0 + \\beta X})} The first coefficient is always the constant term (intercept) of the model. Pingouin will automatically add the intercept to your predictor(s) matrix, therefore, :math:`X` should not include a constant term. Pingouin will remove any constant term (e.g column with only one unique value), or duplicate columns from :math:`X`. The calculation of the p-values and confidence interval is adapted from a `code by Rob Speare <https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d>`_. Results have been compared against statsmodels, R, and JASP. Examples -------- 1. Simple binary logistic regression. In this first example, we'll use the `penguins dataset <https://github.com/allisonhorst/palmerpenguins>`_ to see how well we can predict the sex of penguins based on their bodies mass. >>> import numpy as np >>> import pandas as pd >>> import pingouin as pg >>> df = pg.read_dataset('penguins') >>> # Let's first convert the target variable from string to boolean: >>> df['male'] = (df['sex'] == 'male').astype(int) # male: 1, female: 0 >>> # Since there are missing values in our outcome variable, we need to >>> # set `remove_na=True` otherwise regression will fail. >>> lom = pg.logistic_regression(df['body_mass_g'], df['male'], ... 
remove_na=True) >>> lom.round(2) names coef se z pval CI[2.5%] CI[97.5%] 0 Intercept -5.16 0.71 -7.24 0.0 -6.56 -3.77 1 body_mass_g 0.00 0.00 7.24 0.0 0.00 0.00 Body mass is a significant predictor of sex (p<0.001). Here, it could be useful to rescale our predictor variable from *g* to *kg* (e.g divide by 1000) in order to get more intuitive coefficients and confidence intervals: >>> df['body_mass_kg'] = df['body_mass_g'] / 1000 >>> lom = pg.logistic_regression(df['body_mass_kg'], df['male'], ... remove_na=True) >>> lom.round(2) names coef se z pval CI[2.5%] CI[97.5%] 0 Intercept -5.16 0.71 -7.24 0.0 -6.56 -3.77 1 body_mass_kg 1.23 0.17 7.24 0.0 0.89 1.56 2. Multiple binary logistic regression We'll now add the species as a categorical predictor in our model. To do so, we first need to dummy-code our categorical variable, dropping the first level of our categorical variable (species = Adelie) which will be used as the reference level: >>> df = pd.get_dummies(df, columns=['species'], drop_first=True) >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']] >>> y = df['male'] >>> lom = pg.logistic_regression(X, y, remove_na=True) >>> lom.round(2) names coef se z pval CI[2.5%] CI[97.5%] 0 Intercept -26.24 2.84 -9.24 0.00 -31.81 -20.67 1 body_mass_kg 7.10 0.77 9.23 0.00 5.59 8.61 2 species_Chinstrap -0.13 0.42 -0.31 0.75 -0.96 0.69 3 species_Gentoo -9.72 1.12 -8.65 0.00 -11.92 -7.52 3. Using NumPy aray and returning only the coefficients >>> pg.logistic_regression(X.to_numpy(), y.to_numpy(), coef_only=True, ... remove_na=True) array([-26.23906892, 7.09826571, -0.13180626, -9.71718529]) 4. Passing custom parameters to sklearn >>> lom = pg.logistic_regression(X, y, solver='sag', max_iter=10000, ... 
random_state=42, remove_na=True) >>> print(lom['coef'].to_numpy()) [-25.98248153 7.02881472 -0.13119779 -9.62247569] **How to interpret the log-odds coefficients?** We'll use the `Wikipedia example <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_ of the probability of passing an exam versus the hours of study: *A group of 20 students spends between 0 and 6 hours studying for an exam. How does the number of hours spent studying affect the probability of the student passing the exam?* >>> # First, let's create the dataframe >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50, ... 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50] >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1] >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass}) >>> # And then run the logistic regression >>> lr = pg.logistic_regression(df['HoursStudy'], df['PassExam']).round(3) >>> lr names coef se z pval CI[2.5%] CI[97.5%] 0 Intercept -4.078 1.761 -2.316 0.021 -7.529 -0.626 1 HoursStudy 1.505 0.629 2.393 0.017 0.272 2.737 The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1`` when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating the log-odds: >>> np.exp(-4.078) 0.016941314421496552 i.e. :math:`0.017:1`. Conversely the odds of failing the exam are :math:`(1/0.017) \\approx 59:1`. The probability can then be obtained with the following equation .. math:: p = \\frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}} >>> 1 / (1 + np.exp(-(-4.078))) 0.016659087580814722 The ``HoursStudy`` coefficient (1.505) means that for each additional hour of study, the log-odds of passing the exam increase by 1.505, and the odds are multipled by :math:`e^{1.505} \\approx 4.50`. 
For example, a student who studies 2 hours has a probability of passing the exam of 25%: >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505))) 0.2557836148964987 The table below shows the probability of passing the exam for several values of ``HoursStudy``: +----------------+----------+----------------+------------------+ | Hours of Study | Log-odds | Odds | Probability | +================+==========+================+==================+ | 0 | −4.08 | 0.017 ≈ 1:59 | 0.017 | +----------------+----------+----------------+------------------+ | 1 | −2.57 | 0.076 ≈ 1:13 | 0.07 | +----------------+----------+----------------+------------------+ | 2 | −1.07 | 0.34 ≈ 1:3 | 0.26 | +----------------+----------+----------------+------------------+ | 3 | 0.44 | 1.55 | 0.61 | +----------------+----------+----------------+------------------+ | 4 | 1.94 | 6.96 | 0.87 | +----------------+----------+----------------+------------------+ | 5 | 3.45 | 31.4 | 0.97 | +----------------+----------+----------------+------------------+ | 6 | 4.96 | 141.4 | 0.99 | +----------------+----------+----------------+------------------+ """ # Check that sklearn is installed from pingouin.utils import _is_sklearn_installed _is_sklearn_installed(raise_error=True) from sklearn.linear_model import LogisticRegression # Extract names if X is a Dataframe or Series if isinstance(X, pd.DataFrame): names = X.keys().tolist() elif isinstance(X, pd.Series): names = [X.name] else: names = [] # Convert to numpy array X = np.asarray(X) y = np.asarray(y) assert y.ndim == 1, 'y must be one-dimensional.' assert 0 < alpha < 1, 'alpha must be between 0 and 1.' # Add axis if only one-dimensional array if X.ndim == 1: X = X[..., np.newaxis] # Check for NaN / Inf if remove_na: X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows') y = np.squeeze(y) y_gd = np.isfinite(y).all() X_gd = np.isfinite(X).all() assert y_gd, ("Target (y) contains NaN or Inf. 
Please remove them " "manually or use remove_na=True.") assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them " "manually or use remove_na=True.") # Check that X and y have same length assert y.shape[0] == X.shape[0], 'X and y must have same number of samples' # Check that y is binary if np.unique(y).size != 2: raise ValueError('Dependent variable must be binary.') if not names: names = ['x' + str(i + 1) for i in range(X.shape[1])] # We also want to make sure that there is no column # with only one unique value, otherwise the regression fails # This is equivalent, but much faster, to pd.DataFrame(X).nunique() idx_unique = np.where(np.all(X == X[0, :], axis=0))[0] if len(idx_unique): X = np.delete(X, idx_unique, 1) names = np.delete(names, idx_unique).tolist() # Finally, we want to remove duplicate columns if X.shape[1] > 1: idx_duplicate = [] for pair in itertools.combinations(range(X.shape[1]), 2): if np.array_equal(X[:, pair[0]], X[:, pair[1]]): idx_duplicate.append(pair[1]) if len(idx_duplicate): X = np.delete(X, idx_duplicate, 1) names = np.delete(names, idx_duplicate).tolist() # Initialize and fit if 'solver' not in kwargs: # https://stats.stackexchange.com/a/204324/253579 # Updated in Pingouin > 0.3.6 to be consistent with R kwargs['solver'] = 'newton-cg' if 'penalty' not in kwargs: kwargs['penalty'] = 'none' lom = LogisticRegression(**kwargs) lom.fit(X, y) if lom.get_params()['fit_intercept']: names.insert(0, "Intercept") X_design = np.column_stack((np.ones(X.shape[0]), X)) coef = np.append(lom.intercept_, lom.coef_) else: coef = lom.coef_ X_design = X if coef_only: return coef # Fisher Information Matrix n, p = X_design.shape denom = (2 * (1 + np.cosh(lom.decision_function(X)))) denom = np.tile(denom, (p, 1)).T fim = (X_design / denom).T @ X_design crao = np.linalg.pinv(fim) # Standard error and Z-scores se = np.sqrt(np.diag(crao)) z_scores = coef / se # Two-tailed p-values pval = 2 * norm.sf(np.fabs(z_scores)) # Wald Confidence intervals # 
In R: this is equivalent to confint.default(model) # Note that confint(model) will however return the profile CI crit = norm.ppf(1 - alpha / 2) ll = coef - crit * se ul = coef + crit * se # Rename CI ll_name = 'CI[%.1f%%]' % (100 * alpha / 2) ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2)) # Create dict stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores, 'pval': pval, ll_name: ll, ul_name: ul} if as_dataframe: return _postprocess_dataframe(pd.DataFrame(stats)) else: return stats
def linear_regression(X, y, add_intercept=True, weights=None, coef_only=False,
                      alpha=0.05, as_dataframe=True, remove_na=False,
                      relimp=False):
    """(Multiple) Linear regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
    add_intercept : bool
        If False, assume that the data are already centered. If True, add a
        constant term to the model. In this case, the first value in the
        output dict is the intercept of the model.

        .. note:: It is generally recommended to include a constant term
            (intercept) to the model to limit the bias and force the residual
            mean to equal zero. The intercept coefficient and p-values
            are however rarely meaningful.
    weights : array_like
        An optional vector of sample weights to be used in the fitting
        process, of shape *(n_samples)*. Missing or negative weights are not
        allowed. If not null, a weighted least squares is calculated.

        .. versionadded:: 0.3.5
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if
        missing values are present in either the predictor(s) or dependent
        variable.
    relimp : bool
        If True, returns the relative importance (= contribution) of
        predictors. This is irrelevant when the predictors are uncorrelated:
        the total :math:`R^2` of the model is simply the sum of each
        univariate regression :math:`R^2`-values. However, this does not
        apply when predictors are correlated. Instead, the total
        :math:`R^2` of the model is partitioned by averaging over all
        combinations of predictors, as done in the
        `relaimpo
        <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_
        R package (``calc.relimp(type="lmg")``).

        .. warning:: The computation time roughly doubles for each
            additional predictor and therefore this can be extremely slow for
            models with more than 12-15 predictors.

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Linear regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients
        * ``'se'``: standard errors
        * ``'T'``: T-values
        * ``'pval'``: p-values
        * ``'r2'``: coefficient of determination (:math:`R^2`)
        * ``'adj_r2'``: adjusted :math:`R^2`
        * ``'CI[2.5%]'``: lower confidence intervals
        * ``'CI[97.5%]'``: upper confidence intervals
        * ``'relimp'``: relative contribution of each predictor to the final
          :math:`R^2` (only if ``relimp=True``).
        * ``'relimp_perc'``: percent relative contribution

        In addition, the output dataframe comes with hidden attributes such as
        the residuals, and degrees of freedom of the model and residuals, which
        can be accessed as follow, respectively:

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm.residuals_, lm.df_model_, lm.df_resid_ # doctest: +SKIP

        Note that to follow scikit-learn convention, these hidden attributes
        end with an "_". When ``as_dataframe=False`` however, these attributes
        are no longer hidden and can be accessed as any other keys in the
        output dictionary.

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm['residuals'], lm['df_model'], lm['df_resid'] # doctest: +SKIP

        When ``as_dataframe=False`` the dictionary also contains the
        processed ``X`` and ``y`` arrays (i.e, with NaNs removed if
        ``remove_na=True``) and the model's predicted values ``pred``.

        >>> lm['X'], lm['y'], lm['pred'] # doctest: +SKIP

        For a weighted least squares fit, the weighted ``Xw`` and ``yw``
        arrays are included in the dictionary.

        >>> lm['Xw'], lm['yw'] # doctest: +SKIP

    Raises
    ------
    ValueError
        If ``relimp=True`` is combined with ``weights``.
    AssertionError
        If the inputs fail one of the sanity checks (shape of ``y``, range
        of ``alpha``, non-finite values without ``remove_na=True``, mismatch
        in the number of samples, invalid weights, too few samples/columns).

    See also
    --------
    logistic_regression, mediation_analysis, corr

    Notes
    -----
    The :math:`\\beta` coefficients are estimated using an ordinary least
    squares (OLS) regression, as implemented in the
    :py:func:`scipy.linalg.lstsq` function. The OLS method minimizes
    the sum of squared residuals, and leads to a closed-form expression for
    the estimated :math:`\\beta`:

    .. math:: \\hat{\\beta} = (X^TX)^{-1} X^Ty

    It is generally recommended to include a constant term (intercept) to the
    model to limit the bias and force the residual mean to equal zero.
    Note that intercept coefficient and p-values are however rarely meaningful.

    The standard error of the estimates is a measure of the accuracy of the
    prediction defined as:

    .. math:: \\sigma = \\sqrt{\\text{MSE} \\cdot (X^TX)^{-1}}

    where :math:`\\text{MSE}` is the mean squared error,

    .. math::

        \\text{MSE} = \\frac{SS_{\\text{resid}}}{n - p - 1}
         = \\frac{\\sum{(\\text{true} - \\text{pred})^2}}{n - p - 1}

    :math:`p` is the total number of predictor variables in the model
    (excluding the intercept) and :math:`n` is the sample size.

    Using the :math:`\\beta` coefficients and the standard errors,
    the T-values can be obtained:

    .. math:: T = \\frac{\\beta}{\\sigma}

    and the p-values approximated using a T-distribution with
    :math:`n - p - 1` degrees of freedom.

    The coefficient of determination (:math:`R^2`) is defined as:

    .. math:: R^2 = 1 - (\\frac{SS_{\\text{resid}}}{SS_{\\text{total}}})

    The adjusted :math:`R^2` is defined as:

    .. math:: \\overline{R}^2 = 1 - (1 - R^2) \\frac{n - 1}{n - p - 1}

    The relative importance (``relimp``) column is a partitioning of the
    total :math:`R^2` of the model into individual :math:`R^2` contribution.
    This is calculated by taking the average over average contributions in
    models of different sizes. For more details, please refer to
    `Groemping et al. 2006 <http://dx.doi.org/10.18637/jss.v017.i01>`_
    and the R package `relaimpo
    <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_.

    Note that Pingouin will automatically remove any duplicate columns
    from :math:`X`, as well as any column with only one unique value
    (constant), excluding the intercept.

    If the design matrix is still rank deficient after these checks (i.e. a
    column is a linear combination of other columns), a warning is issued,
    the residual sum of squares is computed from the residuals of the
    minimum-norm least squares solution and the residual degrees of freedom
    are based on the rank of the design matrix.

    Results have been compared against sklearn, R, statsmodels and JASP.

    Examples
    --------
    1. Simple linear regression using columns of a pandas dataframe

    In this first example, we'll use the tips dataset to see how well we
    can predict the waiter's tip (in dollars) based on the total bill (also
    in dollars).

    >>> import numpy as np
    >>> import pingouin as pg
    >>> df = pg.read_dataset('tips')
    >>> # Let's predict the tip ($) based on the total bill (also in $)
    >>> lm = pg.linear_regression(df['total_bill'], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.92  0.16   5.76   0.0  0.46    0.45      0.61       1.23
    1  total_bill  0.11  0.01  14.26   0.0  0.46    0.45      0.09       0.12

    It comes as no surprise that total bill is indeed a significant predictor
    of the waiter's tip (T=14.26, p<0.05). The :math:`R^2` of the model is 0.46
    and the adjusted :math:`R^2` is 0.45, which means that our model roughly
    explains ~45% of the total variance in the tip amount.

    2. Multiple linear regression

    We can also have more than one predictor and run a multiple linear
    regression. Below, we add the party size as a second predictor of tip.

    >>> # We'll add a second predictor: the party size
    >>> lm = pg.linear_regression(df[['total_bill', 'size']], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1  total_bill  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2        size  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    The party size is also a significant predictor of tip (T=2.26, p=0.02).
    Note that adding this new predictor however only improved the :math:`R^2`
    of our model by ~1%.

    This function also works with numpy arrays:

    >>> X = df[['total_bill', 'size']].to_numpy()
    >>> y = df['tip'].to_numpy()
    >>> pg.linear_regression(X, y).round(2)
           names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1         x1  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2         x2  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    3. Get the residuals

    >>> # For clarity, only display the first 9 values
    >>> np.round(lm.residuals_, 2)[:9]
    array([-1.62, -0.55,  0.31,  0.06, -0.11,  0.93,  0.13, -0.81, -0.49])

    Using pandas, we can show a summary of the distribution of the residuals:

    >>> import pandas as pd
    >>> pd.Series(lm.residuals_).describe().round(2)
    count    244.00
    mean      -0.00
    std        1.01
    min       -2.93
    25%       -0.55
    50%       -0.09
    75%        0.51
    max        4.04
    dtype: float64

    4. No intercept and return only the regression coefficients

    Sometimes it may be useful to remove the constant term from the regression,
    or to only return the regression coefficients without calculating the
    standard errors or p-values. This latter can potentially save you a lot of
    time if you need to calculate hundreds of regression and only care about
    the coefficients!

    >>> pg.linear_regression(X, y, add_intercept=False, coef_only=True)
    array([0.1007119 , 0.36209717])

    5. Return a dictionary instead of a dataframe

    >>> lm_dict = pg.linear_regression(X, y, as_dataframe=False)
    >>> lm_dict.keys()
    dict_keys(['names', 'coef', 'se', 'T', 'pval', 'r2', 'adj_r2', 'CI[2.5%]',
               'CI[97.5%]', 'df_model', 'df_resid', 'residuals', 'X', 'y',
               'pred'])

    6. Remove missing values

    >>> X[4, 1] = np.nan
    >>> y[7] = np.nan
    >>> pg.linear_regression(X, y, remove_na=True, coef_only=True)
    array([0.65749955, 0.09262059, 0.19927529])

    7. Get the relative importance of predictors

    >>> lm = pg.linear_regression(X, y, remove_na=True, relimp=True)
    >>> lm[['names', 'relimp', 'relimp_perc']]
           names    relimp  relimp_perc
    0  Intercept       NaN          NaN
    1         x1  0.342503    73.045583
    2         x2  0.126386    26.954417

    The ``relimp`` column is a partitioning of the total :math:`R^2` of the
    model into individual contribution. Therefore, it sums to the :math:`R^2`
    of the full model. The ``relimp_perc`` is normalized to sum to 100%. See
    `Groemping 2006 <https://www.jstatsoft.org/article/view/v017i01>`_
    for more details.

    >>> lm[['relimp', 'relimp_perc']].sum()
    relimp           0.468889
    relimp_perc    100.000000
    dtype: float64

    8. Weighted linear regression

    >>> X = [1, 2, 3, 4, 5, 6]
    >>> y = [10, 22, 11, 13, 13, 16]
    >>> w = [1, 0.1, 1, 1, 0.5, 1]  # Array of weights. Must be >= 0.
    >>> lm = pg.linear_regression(X, y, weights=w)
    >>> lm.round(2)
           names  coef    se     T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  9.00  2.03  4.42  0.01  0.51    0.39      3.35      14.64
    1         x1  1.04  0.50  2.06  0.11  0.51    0.39     -0.36       2.44
    """
    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert input to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    if X.ndim == 1:
        # Convert to (n_samples, n_features) shape
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    if add_intercept:
        # Add intercept
        X = np.column_stack((np.ones(X.shape[0]), X))
        names.insert(0, "Intercept")

    # FINAL CHECKS BEFORE RUNNING LEAST SQUARES REGRESSION
    # 1. Let's remove column(s) with only zero, otherwise the regression fails
    n_nonzero = np.count_nonzero(X, axis=0)
    idx_zero = np.flatnonzero(n_nonzero == 0)  # Find columns that are only 0
    if len(idx_zero):
        X = np.delete(X, idx_zero, 1)
        # Keep ``names`` a plain list (as in logistic_regression)
        names = np.delete(names, idx_zero).tolist()

    # 2. We also want to make sure that there is no more than one constant
    # column (= intercept), otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique) > 1:
        # We remove all but the first "Intercept" column.
        X = np.delete(X, idx_unique[1:], 1)
        names = np.delete(names, idx_unique[1:]).tolist()
    # Is there a constant in our predictor matrix? Useful for dof and R^2.
    constant = 1 if len(idx_unique) > 0 else 0

    # 3. Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # 4. Check that we have enough samples / features
    n, p = X.shape[0], X.shape[1]
    assert n >= 3, 'At least three valid samples are required in X.'
    assert p >= 1, 'X must have at least one valid column.'

    # 5. Handle weights
    if weights is not None:
        if relimp:
            raise ValueError("relimp = True is not supported when using "
                             "weights.")
        w = np.asarray(weights)
        assert w.ndim == 1, 'weights must be a 1D array.'
        assert w.size == n, 'weights must be of shape n_samples.'
        assert not np.isnan(w).any(), 'Missing weights are not accepted.'
        assert not (w < 0).any(), 'Negative weights are not accepted.'
        # Do not count weights == 0 in dof
        # This gives similar results as R lm() but different from statsmodels
        n = np.count_nonzero(w)
        # Rescale (whitening). Scaling each row by sqrt(w) is equivalent to
        # left-multiplying by diag(sqrt(w)), without building a dense
        # (n_samples, n_samples) matrix.
        sqrt_w = np.sqrt(w)
        Xw = X * sqrt_w[:, np.newaxis]
        yw = y * sqrt_w
    else:
        # Set all weights to one, [1, 1, 1, ...]
        w = np.ones(n)
        Xw = X
        yw = y

    # FIT (WEIGHTED) LEAST SQUARES REGRESSION USING SCIPY.LINALG.LSTSQ
    coef, ss_res, rank, _ = lstsq(Xw, yw)
    if coef_only:
        return coef

    if rank < Xw.shape[1]:
        import warnings
        warnings.warn(
            "Design matrix supplied with `X` parameter is rank deficient "
            "(rank %i with %i columns). That means that one or more of the "
            "columns in `X` are a linear combination of one or more of the "
            "other columns." % (rank, Xw.shape[1]))

    # Degrees of freedom. For a full-rank design matrix, rank == p.
    df_model = rank - constant
    df_resid = n - rank

    # Calculate predicted values and (weighted) residuals
    pred = Xw @ coef
    resid = yw - pred
    if np.size(ss_res) == 0:
        # lstsq returns an empty array instead of the residual sum of squares
        # when the design matrix is rank deficient or when n <= p
        ss_res = (resid ** 2).sum()

    # Calculate total (weighted) sums of squares and R^2
    ss_tot = yw @ yw
    ss_wtot = np.sum(w * (y - np.average(y, weights=w))**2)
    if constant:
        r2 = 1 - ss_res / ss_wtot
    else:
        r2 = 1 - ss_res / ss_tot
    adj_r2 = 1 - (1 - r2) * (n - constant) / df_resid

    # Compute mean squared error, variance and SE
    mse = ss_res / df_resid
    beta_var = mse * (np.linalg.pinv(Xw.T @ Xw).diagonal())
    beta_se = np.sqrt(beta_var)

    # Compute T and p-values
    T = coef / beta_se
    pval = 2 * t.sf(np.fabs(T), df_resid)

    # Compute confidence intervals
    crit = t.ppf(1 - alpha / 2, df_resid)
    marg_error = crit * beta_se
    ll = coef - marg_error
    ul = coef + marg_error

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': beta_se, 'T': T,
             'pval': pval, 'r2': r2, 'adj_r2': adj_r2, ll_name: ll,
             ul_name: ul}

    # Relative importance
    if relimp:
        data = pd.concat([pd.DataFrame(y, columns=['y']),
                          pd.DataFrame(X, columns=names)], sort=False, axis=1)
        if 'Intercept' in names:
            # Intercept is the first column
            reli = _relimp(data.drop(columns=['Intercept']).cov())
            reli['names'] = ['Intercept'] + reli['names']
            reli['relimp'] = np.insert(reli['relimp'], 0, np.nan)
            reli['relimp_perc'] = np.insert(reli['relimp_perc'], 0, np.nan)
        else:
            reli = _relimp(data.cov())
        stats.update(reli)

    if as_dataframe:
        stats = _postprocess_dataframe(pd.DataFrame(stats))
        stats.df_model_ = df_model
        stats.df_resid_ = df_resid
        stats.residuals_ = 0  # Trick to avoid Pandas warning
        stats.residuals_ = resid  # Residuals is a hidden attribute
    else:
        stats['df_model'] = df_model
        stats['df_resid'] = df_resid
        stats['residuals'] = resid
        stats['X'] = X
        stats['y'] = y
        stats['pred'] = pred
        if weights is not None:
            stats['yw'] = yw
            stats['Xw'] = Xw
    return stats
def intraclass_corr(data=None, targets=None, raters=None, ratings=None,
                    nan_policy='raise'):
    """Intraclass correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format dataframe. Data must be fully balanced.
    targets : string
        Name of column in ``data`` containing the targets.
    raters : string
        Name of column in ``data`` containing the raters.
    ratings : string
        Name of column in ``data`` containing the ratings.
    nan_policy : str
        Defines how to handle when input contains missing values (nan).
        `'raise'` (default) throws an error, `'omit'` performs the calculations
        after deleting target(s) with one or more missing values (= listwise
        deletion).

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Output dataframe:

        * ``'Type'``: ICC type
        * ``'Description'``: description of the ICC
        * ``'ICC'``: intraclass correlation
        * ``'F'``: F statistic
        * ``'df1'``: numerator degree of freedom
        * ``'df2'``: denominator degree of freedom
        * ``'pval'``: p-value
        * ``'CI95%'``: 95% confidence intervals around the ICC

    Raises
    ------
    ValueError
        If missing values are present (or the data are unbalanced) and
        ``nan_policy='raise'``.

    Notes
    -----
    The intraclass correlation (ICC, [1]_) assesses the reliability of ratings
    by comparing the variability of different ratings of the same subject to
    the total variation across all ratings and all subjects.

    Shrout and Fleiss (1979) [2]_ describe six cases of reliability of ratings
    done by :math:`k` raters on :math:`n` targets. Pingouin returns all six
    cases with corresponding F and p-values, as well as 95% confidence
    intervals.

    From the documentation of the ICC function in the `psych
    <https://cran.r-project.org/web/packages/psych/psych.pdf>`_ R package:

    - **ICC1**: Each target is rated by a different rater and the raters are
      selected at random. This is a one-way ANOVA fixed effects model.

    - **ICC2**: A random sample of :math:`k` raters rate each target. The
      measure is one of absolute agreement in the ratings. ICC1 is sensitive
      to differences in means between raters and is a measure of absolute
      agreement.

    - **ICC3**: A fixed set of :math:`k` raters rate each target. There is no
      generalization to a larger population of raters. ICC2 and ICC3 remove
      mean differences between raters, but are sensitive to interactions.
      The difference between ICC2 and ICC3 is whether raters are seen as fixed
      or random effects.

    Then, for each of these cases, the reliability can either be estimated for
    a single rating or for the average of :math:`k` ratings. The 1 rating case
    is equivalent to the average intercorrelation, while the :math:`k` rating
    case is equivalent to the Spearman Brown adjusted reliability.
    **ICC1k**, **ICC2k**, **ICC3k** reflect the means of :math:`k` raters.

    This function has been tested against the ICC function of the R psych
    package. Note however that contrarily to the R implementation, the
    current implementation does not use linear mixed effect but regular ANOVA,
    which means that it only works with complete-case data (no missing values).

    References
    ----------
    .. [1] http://www.real-statistics.com/reliability/intraclass-correlation/

    .. [2] Shrout, P. E., & Fleiss, J. L. (1979). Intraclass correlations: uses
           in assessing rater reliability. Psychological bulletin, 86(2), 420.

    Examples
    --------
    ICCs of wine quality assessed by 4 judges.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('icc')
    >>> icc = pg.intraclass_corr(data=data, targets='Wine', raters='Judge',
    ...                          ratings='Scores').round(3)
    >>> icc.set_index("Type")
                       Description    ICC       F  df1  df2  pval         CI95%
    Type
    ICC1    Single raters absolute  0.728  11.680    7   24   0.0  [0.43, 0.93]
    ICC2      Single random raters  0.728  11.787    7   21   0.0  [0.43, 0.93]
    ICC3       Single fixed raters  0.729  11.787    7   21   0.0  [0.43, 0.93]
    ICC1k  Average raters absolute  0.914  11.680    7   24   0.0  [0.75, 0.98]
    ICC2k    Average random raters  0.914  11.787    7   21   0.0  [0.75, 0.98]
    ICC3k     Average fixed raters  0.915  11.787    7   21   0.0  [0.75, 0.98]
    """
    from pingouin import anova

    # Safety check
    assert isinstance(data, pd.DataFrame), 'data must be a dataframe.'
    assert all([v is not None for v in [targets, raters, ratings]])
    assert all([v in data.columns for v in [targets, raters, ratings]])
    assert nan_policy in ['omit', 'raise']

    # Convert data to wide-format
    data = data.pivot_table(index=targets, columns=raters, values=ratings)

    # Listwise deletion of missing values
    nan_present = data.isna().any().any()
    if nan_present:
        if nan_policy == 'omit':
            data = data.dropna(axis=0, how='any')
        else:
            raise ValueError("Either missing values are present in data or "
                             "data are unbalanced. Please remove them "
                             "manually or use nan_policy='omit'.")

    # Back to long-format
    # data_wide = data.copy()  # Optional, for PCA
    data = data.reset_index().melt(id_vars=targets, value_name=ratings)

    # Check that ratings is a numeric variable
    assert data[ratings].dtype.kind in 'bfiu', 'Ratings must be numeric.'
    # Check that data are fully balanced
    # This behavior is ensured by the long-to-wide-to-long transformation
    # Unbalanced data will result in rows with missing values.
    # assert data.groupby(raters)[ratings].count().nunique() == 1

    # Extract sizes
    k = data[raters].nunique()
    n = data[targets].nunique()

    # Two-way ANOVA
    with np.errstate(invalid='ignore'):
        # For max precision, make sure rounding is disabled. The global
        # option is restored in a ``finally`` clause so that an exception
        # raised by the ANOVA cannot leave rounding disabled for later calls.
        had_round = 'round' in options
        previous_round = options.get('round')
        options['round'] = None
        try:
            aov = anova(data=data, dv=ratings, between=[targets, raters],
                        ss_type=2)
        finally:
            if had_round:
                options['round'] = previous_round
            else:
                options.pop('round', None)

    # Extract mean squares
    msb = aov.at[0, 'MS']
    msw = (aov.at[1, 'SS'] + aov.at[2, 'SS']) / (aov.at[1, 'DF'] +
                                                 aov.at[2, 'DF'])
    msj = aov.at[1, 'MS']
    mse = aov.at[2, 'MS']

    # Calculate ICCs
    icc1 = (msb - msw) / (msb + (k - 1) * msw)
    icc2 = (msb - mse) / (msb + (k - 1) * mse + k * (msj - mse) / n)
    icc3 = (msb - mse) / (msb + (k - 1) * mse)
    icc1k = (msb - msw) / msb
    icc2k = (msb - mse) / (msb + (msj - mse) / n)
    icc3k = (msb - mse) / msb

    # Calculate F, df, and p-values
    f1k = msb / msw
    df1 = n - 1
    df1kd = n * (k - 1)
    p1k = f.sf(f1k, df1, df1kd)

    f2k = f3k = msb / mse
    df2kd = (n - 1) * (k - 1)
    p2k = f.sf(f2k, df1, df2kd)

    # Create output dataframe
    stats = {
        'Type': ['ICC1', 'ICC2', 'ICC3', 'ICC1k', 'ICC2k', 'ICC3k'],
        'Description': [
            'Single raters absolute', 'Single random raters',
            'Single fixed raters', 'Average raters absolute',
            'Average random raters', 'Average fixed raters'
        ],
        'ICC': [icc1, icc2, icc3, icc1k, icc2k, icc3k],
        'F': [f1k, f2k, f2k, f1k, f2k, f2k],
        'df1': n - 1,
        'df2': [df1kd, df2kd, df2kd, df1kd, df2kd, df2kd],
        'pval': [p1k, p2k, p2k, p1k, p2k, p2k]
    }

    stats = pd.DataFrame(stats)

    # Calculate confidence intervals
    alpha = 0.05
    # Case 1 and 3
    f1l = f1k / f.ppf(1 - alpha / 2, df1, df1kd)
    f1u = f1k * f.ppf(1 - alpha / 2, df1kd, df1)
    l1 = (f1l - 1) / (f1l + (k - 1))
    u1 = (f1u - 1) / (f1u + (k - 1))
    f3l = f3k / f.ppf(1 - alpha / 2, df1, df2kd)
    f3u = f3k * f.ppf(1 - alpha / 2, df2kd, df1)
    l3 = (f3l - 1) / (f3l + (k - 1))
    u3 = (f3u - 1) / (f3u + (k - 1))
    # Case 2
    fj = msj / mse
    vn = df2kd * ((k * icc2 * fj + n * (1 + (k - 1) * icc2) - k * icc2))**2
    vd = df1 * k**2 * icc2**2 * fj**2 + \
        (n * (1 + (k - 1) * icc2) - k * icc2)**2
    v = vn / vd
    f2u = f.ppf(1 - alpha / 2, n - 1, v)
    f2l = f.ppf(1 - alpha / 2, v, n - 1)
    l2 = n * (msb - f2u * mse) / (f2u * (k * msj + (k * n - k - n) * mse) +
                                  n * msb)
    u2 = n * (f2l * msb - mse) / (k * msj + (k * n - k - n) * mse +
                                  n * f2l * msb)

    stats['CI95%'] = [
        np.array([l1, u1]),
        np.array([l2, u2]),
        np.array([l3, u3]),
        np.array([1 - 1 / f1l, 1 - 1 / f1u]),
        np.array([l2 * k / (1 + l2 * (k - 1)), u2 * k / (1 + u2 * (k - 1))]),
        np.array([1 - 1 / f3l, 1 - 1 / f3u])
    ]

    return _postprocess_dataframe(stats)