def test_compute_esci(self):
    """Check the confidence intervals returned by compute_esci.

    Since Pingouin v0.3.5, CIs around a Cohen d rely on a T (and not Z)
    distribution, which matches the behavior of the cohen.d function of
    the effsize R package. However, cohen.d does not use the Cohen d-avg
    for paired samples, so CIs for paired samples cannot be compared
    directly. Likewise, R estimates the SE of a one-sample Cohen d with a
    slightly different formula.
    """
    # --- Pearson correlation ---
    pearson_r = 0.5543563
    ci_r = compute_esci(stat=pearson_r, nx=6, eftype='r')
    assert np.allclose(ci_r, [-0.47, 0.94])

    # --- Cohen d: one-sample and paired ---
    # No reference interval from R here because cohen.d relies on
    # different formulas for Cohen d and its SE; the calls below only
    # have to run without raising.
    d_one = compute_effsize(np.r_[x, y], y=0)
    assert round(d_one, 6) == 2.086694  # Same as cohen.d
    compute_esci(d_one, nx + ny, 1, decimals=6)
    d_paired = compute_effsize(x, y, paired=True)
    compute_esci(d_paired, nx, ny, paired=True, decimals=6)

    # --- Cohen d: independent samples (compare with cohen.d) ---
    d_indep = compute_effsize(x, y)
    ci_indep = compute_esci(d_indep, nx, ny, decimals=6)
    np.testing.assert_equal(ci_indep, [-1.067645, 0.226762])

    # Same comparison, but with unequal sample sizes
    y_short = y[:-5]
    d_unbalanced = compute_effsize(x, y_short)
    ci_unbalanced = compute_esci(d_unbalanced, nx, len(y_short), decimals=8)
    np.testing.assert_equal(ci_unbalanced, [-1.33603010, 0.08662825])
def test_compute_esci(self):
    """Check the confidence intervals returned by compute_esci.

    Since Pingouin v0.3.5, CIs around a Cohen d rely on a T (and not Z)
    distribution, which matches the behavior of the cohen.d function of
    the effsize R package. However, cohen.d does not use the Cohen d-avg
    for paired samples, so CIs for paired samples cannot be compared
    directly. Likewise, R estimates the SE of a one-sample Cohen d with a
    slightly different formula.
    """
    # --- Pearson correlation, two-sided ---
    # https://github.com/SurajGupta/r-source/blob/master/src/library/stats/R/cor.test.R
    ci_two_sided = compute_esci(
        stat=0.5543563, nx=6, eftype='r', decimals=6)
    assert np.allclose(ci_two_sided, [-0.4675554, 0.9420809])

    # --- Pearson correlation, one-sided alternatives ---
    # Each row: (r, sample size, alternative, expected interval)
    one_sided_cases = [
        (0.8, 20, "greater", [0.6041625, 1]),
        (-0.2, 30, "greater", [-0.4771478, 1]),
        (-0.8, 20, "less", [-1, -0.6041625]),
        (0.2, 30, "less", [-1, 0.4771478]),
    ]
    for r_val, n_obs, alt, expected in one_sided_cases:
        ci_one_sided = compute_esci(
            stat=r_val, nx=n_obs, eftype='r', alternative=alt, decimals=6)
        assert np.allclose(ci_one_sided, expected)

    # --- Cohen d: one-sample and paired ---
    # No reference interval from R here because cohen.d relies on
    # different formulas for Cohen d and its SE; the calls below only
    # have to run without raising.
    d_one = compute_effsize(np.r_[x, y], y=0)
    assert round(d_one, 6) == 2.086694  # Same as cohen.d
    compute_esci(d_one, nx + ny, 1, decimals=6)
    d_paired = compute_effsize(x, y, paired=True)
    compute_esci(d_paired, nx, ny, paired=True, decimals=6)

    # --- Cohen d: independent samples (compare with cohen.d) ---
    d_indep = compute_effsize(x, y)
    ci_indep = compute_esci(d_indep, nx, ny, decimals=6)
    np.testing.assert_equal(ci_indep, [-1.067645, 0.226762])

    # Same comparison, but with unequal sample sizes
    y_short = y[:-5]
    d_unbalanced = compute_effsize(x, y_short)
    ci_unbalanced = compute_esci(d_unbalanced, nx, len(y_short), decimals=8)
    np.testing.assert_equal(ci_unbalanced, [-1.33603010, 0.08662825])
def test_compute_esci(self):
    """Smoke-test compute_esci and compare a few intervals with R."""
    # These calls only need to complete without raising.
    compute_esci(stat=.6, nx=30, eftype='r')
    compute_esci(stat=.4, nx=len(x), ny=len(y), confidence=.99, decimals=4)
    compute_esci(stat=.6, nx=30, ny=30, eftype='cohen')

    # Compare with R
    ci_r = compute_esci(stat=0.5543563, nx=6, eftype='r')
    assert np.allclose(ci_r, [-0.47, 0.94])

    # One sample / paired T-test: both code paths must give the same CI
    effect = -0.57932
    one_sample_kwargs = {'nx': 40, 'ny': 1}
    ci_one = compute_esci(effect, **one_sample_kwargs)
    ci_paired = compute_esci(effect, paired=True, **one_sample_kwargs)
    assert np.allclose(ci_one, ci_paired)
    assert np.allclose(ci_one, [-0.91, -0.24])
def test_compute_esci(self):
    """Smoke-test compute_esci and compare one interval with R."""
    # These calls only need to complete without raising.
    compute_esci(stat=.6, nx=30, ny=30, eftype='r')
    compute_esci(stat=.4, nx=len(x), ny=len(y), confidence=.99, decimals=4)
    compute_esci(stat=.6, nx=30, ny=30, eftype='cohen')

    # Compare with R
    r_kwargs = {'stat': 0.5543563, 'nx': 6, 'ny': 6, 'eftype': 'r'}
    ci_r = compute_esci(**r_kwargs)
    assert np.allclose(ci_r, [-0.47, 0.94])
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95%' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same size, or if ``method`` is
        not one of the supported methods.
    AssertionError
        If the correlation coefficient is NaN.

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Correlations of -1 or +1 imply
    an exact linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic
    relationship.

    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that
    protects against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn
    package is installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    If ``method='pearson'``, The Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    For a perfect correlation (``r**2 == 1``) the confidence interval
    collapses onto the coefficient itself (``[1, 1]`` or ``[-1, -1]``), the
    power is reported as ``np.inf`` and, for Pearson, ``BF10`` is the
    string ``'inf'``.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> from pingouin import corr
    >>> corr(x, y)
              n      r         CI95%     r2  adj_r2     p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.005813  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> corr(x, y)
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439148  0.302  0.121

    3. Spearman correlation

    >>> corr(x, y, method="spearman")
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028034   0.61

    4. Percentage bend correlation (robust)

    >>> corr(x, y, method='percbend')
               n      r         CI95%     r2  adj_r2     p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.033508  0.581

    5. Shepherd's pi correlation (robust)

    >>> corr(x, y, method='shepherd')
               n  outliers      r         CI95%     r2  adj_r2     p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131  0.020128  0.694

    6. Skipped spearman correlation (robust)

    >>> corr(x, y, method='skipped')
              n  outliers      r         CI95%     r2  adj_r2     p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131  0.020128  0.694

    7. One-tailed Pearson correlation

    >>> corr(x, y, tail="one-sided", method='pearson')
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.219574  0.467  0.194

    8. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> corr(data['x'], data['y'])
              n      r          CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439148  0.302  0.121
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Check size
    if x.size != y.size:
        raise ValueError('x and y must have the same length.')

    # Remove NA (pairwise: a NaN in either array drops the whole row)
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y, method='spearman')
    else:
        raise ValueError('Method not recognized.')

    assert not np.isnan(r), 'Correlation returned NaN. Check your data.'

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    if r2 < 1:
        ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
        pr = round(power_corr(r=r, n=nx, power=None,
                              alpha=0.05, tail=tail), 3)
    else:
        # Perfect correlation: the interval degenerates to the coefficient
        # itself. Keep the sign of r so that r = -1 yields [-1, -1] rather
        # than the (wrong) [1, 1].
        bound = float(np.sign(r))
        ci = [bound, bound]
        pr = np.inf

    # Create dictionary
    stats = {
        'n': nx,
        'r': round(r, 3),
        'r2': round(r2, 3),
        'adj_r2': round(adj_r2, 3),
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    # `outliers` is only defined for the two bivariate-outlier methods
    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        if r2 < 1:
            stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)
        else:
            stats['BF10'] = str(np.inf)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order (columns that were not computed are simply skipped)
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10', 'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'r' : Repeated measures correlation coefficient
        'dof' : Degrees of freedom
        'pval' : one or two tailed p-value
        'CI95%' : 95% parametric confidence intervals (as a string)
        'power' : achieved power of the test (= 1 - type II error).

    Raises
    ------
    AssertionError
        If ``data`` is not a DataFrame, if a column is missing, or if ``x``
        or ``y`` is not numeric.
    ValueError
        If fewer than 3 unique subjects remain after removal of missing
        values.

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From Bakdash and Marusich (2017):

        "Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by − 1 to 1 and represents the strength of the linear
        association between two variables."

    Results have been tested against the `rmcorr` R package.

    Please note that NaN are automatically removed from the dataframe
    (listwise deletion).

    References
    ----------
    .. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
           Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456

    .. [2] Bland, J. M., & Altman, D. G. (1995). Statistics notes: Calculating
           correlation coefficients with repeated observations:
           Part 1—correlation within subjects. Bmj, 310(6977), 446.

    .. [3] https://github.com/cran/rmcorr

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                 r  dof      pval           CI95%  power
    rm_corr -0.507   38  0.000847  [-0.71, -0.23]   0.93
    """
    from pingouin import ancova, power_corr

    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    # 'u' (unsigned integer) is numeric too and must not be rejected
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Count the subjects only once incomplete rows are gone: a subject whose
    # rows were all dropped must not count towards the minimum.
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Using PINGOUIN
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    # Row 1 of the ANCOVA table is read as the covariate, row 2 as the error
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    # The coefficient takes its sign from the common within-subject slope
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)

    # Convert to Dataframe
    stats = pd.DataFrame(
        {
            "r": round(rm, 3),
            "dof": int(dof),
            "pval": pval,
            "CI95%": str(ci),
            "power": round(pwr, 3)
        }, index=["rm_corr"])
    return stats
def corr(x, y, tail='two-sided', method='pearson'):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'`` p-value.
        Note that the former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: one- or two-tailed p-value, depending on ``tail``
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

        If the correlation coefficient is NaN, every column except ``'n'``
        is NaN.

    Raises
    ------
    AssertionError
        If ``x`` or ``y`` is not 1D, or if they differ in size.
    ValueError
        If ``method`` is not recognized.

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation
    rm_corr : Repeated measures correlation

    Notes
    -----
    The `Pearson correlation coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    measures the linear relationship between two datasets. Strictly speaking,
    Pearson's correlation requires that each dataset be normally distributed.
    Correlations of -1 or +1 imply a perfect negative and positive linear
    relationship, respectively, with 0 indicating the absence of association.

    .. math::
        r_{xy} = \\frac{\\sum_i(x_i - \\bar{x})(y_i - \\bar{y})}
        {\\sqrt{\\sum_i(x_i - \\bar{x})^2} \\sqrt{\\sum_i(y_i - \\bar{y})^2}}
        = \\frac{\\text{cov}(x, y)}{\\sigma_x \\sigma_y}

    where :math:`\\text{cov}` is the sample covariance and :math:`\\sigma`
    is the sample standard deviation.

    If ``method='pearson'``, The Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    The `Spearman correlation coefficient
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
    is a non-parametric measure of the monotonicity of the relationship between
    two datasets. Unlike the Pearson correlation, the Spearman correlation does
    not assume that both datasets are normally distributed. Correlations of -1
    or +1 imply an exact negative and positive monotonic relationship,
    respectively. Mathematically, the Spearman correlation coefficient is
    defined as the Pearson correlation coefficient between the
    `rank variables <https://en.wikipedia.org/wiki/Ranking>`_.

    The `Kendall correlation coefficient
    <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
    is a measure of the correspondence between two rankings. Values also range
    from -1 (perfect disagreement) to 1 (perfect agreement), with 0 indicating
    the absence of association. Consistent with
    :py:func:`scipy.stats.kendalltau`, Pingouin returns the Tau-b coefficient,
    which adjusts for ties:

    .. math:: \\tau_B = \\frac{(P - Q)}{\\sqrt{(P + Q + T) (P + Q + U)}}

    where :math:`P` is the number of concordant pairs, :math:`Q` the number of
    discordant pairs, :math:`T` the number of ties in x, and :math:`U`
    the number of ties in y.

    The `biweight midcorrelation
    <https://en.wikipedia.org/wiki/Biweight_midcorrelation>`_ and
    percentage bend correlation [1]_ are both robust methods that
    protect against *univariate* outliers by down-weighting observations that
    deviate too much from the median.

    The Shepherd pi [2]_ correlation and skipped [3]_, [4]_ correlation are
    both robust methods that return the Spearman correlation coefficient after
    removing *bivariate* outliers. Briefly, the Shepherd pi uses a
    bootstrapping of the Mahalanobis distance to identify outliers, while the
    skipped correlation is based on the minimum covariance determinant
    (which requires scikit-learn). Note that these two methods are
    significantly slower than the previous ones.

    .. important:: Please note that rows with missing values (NaN) are
        automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> import pingouin as pg
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> pg.corr(x, y).round(3)
              n      r         CI95%     r2  adj_r2  p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> pg.corr(x, y).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121

    3. Spearman correlation (robust to outliers)

    >>> pg.corr(x, y, method="spearman").round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028   0.61

    4. Biweight midcorrelation (robust)

    >>> pg.corr(x, y, method="bicor").round(3)
            n      r         CI95%     r2  adj_r2  p-val  power
    bicor  30  0.393  [0.04, 0.66]  0.155   0.092  0.031  0.592

    5. Percentage bend correlation (robust)

    >>> pg.corr(x, y, method='percbend').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.034  0.581

    6. Shepherd's pi correlation (robust)

    >>> pg.corr(x, y, method='shepherd').round(3)
               n  outliers      r         CI95%     r2  adj_r2  p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    7. Skipped spearman correlation (robust)

    >>> pg.corr(x, y, method='skipped').round(3)
              n  outliers      r         CI95%     r2  adj_r2  p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    8. One-tailed Pearson correlation

    >>> pg.corr(x, y, tail="one-sided", method='pearson').round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051   0.22  0.467  0.194

    9. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> pg.corr(data['x'], data['y']).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121
    """
    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, 'x and y must be 1D array.'
    assert x.size == y.size, 'x and y must have the same length.'

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'bicor':
        r, pval = bicor(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y)
    else:
        raise ValueError('Method not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoids a sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame(
            {
                'n': nx,
                'r': np.nan,
                'CI95%': np.nan,
                'r2': np.nan,
                'adj_r2': np.nan,
                'p-val': np.nan,
                'BF10': np.nan,
                'power': np.nan
            }, index=[method])

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power.
    # NOTE: no trailing comma after the power_corr call -- it would silently
    # turn `pr` into a 1-tuple instead of a scalar.
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')
    pr = power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail)

    # Create dictionary
    stats = {
        'n': nx,
        'r': r,
        'r2': r2,
        'adj_r2': adj_r2,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
        'power': pr
    }

    # `outliers` is only defined for the two bivariate-outlier methods
    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order (columns that were not computed are simply skipped)
    col_keep = [
        'n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10', 'power'
    ]
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return stats[col_order]
def corr(x, y, tail='two-sided', method='pearson'):
    r"""(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. x and y must be independent.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'r' : Correlation coefficient
        'CI95%' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only,
                 and only when the sample size is below 1000)

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same size, or if ``method`` is
        not one of the supported methods.

    Notes
    -----
    The Pearson correlation coefficient measures the linear relationship
    between two datasets. Strictly speaking, Pearson's correlation requires
    that each dataset be normally distributed. Correlations of -1 or +1 imply
    an exact linear relationship.

    The Spearman correlation is a nonparametric measure of the monotonicity of
    the relationship between two datasets. Unlike the Pearson correlation,
    the Spearman correlation does not assume that both datasets are normally
    distributed. Correlations of -1 or +1 imply an exact monotonic
    relationship.

    Kendall’s tau is a measure of the correspondence between two rankings.
    Values close to 1 indicate strong agreement, values close to -1 indicate
    strong disagreement.

    The percentage bend correlation [1]_ is a robust method that
    protects against univariate outliers.

    The Shepherd's pi [2]_ and skipped [3]_, [4]_ correlations are both robust
    methods that return the Spearman's rho after bivariate outliers removal.
    Note that the skipped correlation requires that the scikit-learn
    package is installed (for computing the minimum covariance determinant).

    Please note that rows with NaN are automatically removed.

    If method='pearson', The JZS Bayes Factor is approximated using the
    formula described in ref [5]_:

    .. math::

        BF_{10} = \dfrac{\sqrt{n/2}}{\gamma(1/2)}*\int_{0}^{\infty}e((n-2)/2)*
        log(1+g)+(-(n-1)/2)log(1+(1-r^2)*g)+(-3/2)log(g)-n/2g

    where **n** is the sample size and **r** is the Pearson correlation
    coefficient.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    .. [5] Wetzels, R., Wagenmakers, E.-J., 2012. A default Bayesian
       hypothesis test for correlations and partial correlations.
       Psychon. Bull. Rev. 19, 1057–1064.
       https://doi.org/10.3758/s13423-012-0295-x

    Examples
    --------
    1. Pearson correlation

    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> from pingouin import corr
    >>> corr(x, y)
              n      r         CI95%     r2  adj_r2   p-val   BF10
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.0058  6.135

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> corr(x, y)
              n      r          CI95%     r2  adj_r2  p-val  BF10
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.19

    3. Spearman correlation

    >>> corr(x, y, method="spearman")
               n      r         CI95%     r2  adj_r2  p-val
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028

    4. Percentage bend correlation (robust)

    >>> corr(x, y, method='percbend')
               n      r         CI95%     r2  adj_r2  p-val
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.034

    5. Shepherd's pi correlation (robust)

    >>> corr(x, y, method='shepherd')
               n      r         CI95%     r2  adj_r2  p-val
    shepherd  30  0.437  [0.09, 0.69]  0.191   0.131  0.020

    6. Skipped spearman correlation (robust)

    >>> corr(x, y, method='skipped')
              n      r         CI95%     r2  adj_r2  p-val
    skipped  30  0.437  [0.09, 0.69]  0.191   0.131  0.020

    7. One-tailed Spearman correlation

    >>> corr(x, y, tail="one-sided", method='spearman')
               n      r         CI95%     r2  adj_r2  p-val
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.014

    8. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> corr(data['x'], data['y'])
              n      r          CI95%     r2  adj_r2  p-val  BF10
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.19
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Check size
    if x.size != y.size:
        raise ValueError('x and y must have the same length.')

    # Remove NA (pairwise: a NaN in either array drops the whole row)
    x, y = _remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y)
    elif method == 'kendall':
        r, pval = kendalltau(x, y)
    elif method == 'percbend':
        r, pval = percbend(x, y)
    elif method == 'shepherd':
        r, pval = shepherd(x, y)
    elif method == 'skipped':
        r, pval, _ = skipped(x, y, method='spearman')
    else:
        raise ValueError('Method not recognized.')

    # Compute r2 (once) and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r')

    stats = pd.DataFrame({}, index=[method])
    stats['n'] = nx
    stats['r'] = np.round(r, 3)
    stats['CI95%'] = [ci]
    stats['r2'] = np.round(r2, 3)
    stats['adj_r2'] = np.round(adj_r2, 3)
    stats['p-val'] = pval if tail == 'two-sided' else .5 * pval

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson' and nx < 1000:
        stats['BF10'] = bayesfactor_pearson(r, nx)

    # Reindexing adds an all-NaN 'BF10' column when it was not computed;
    # the dropna call then removes any column that is entirely NaN.
    col_order = ['n', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val', 'BF10']
    stats = stats.reindex(columns=col_order)
    stats.dropna(how='all', axis=1, inplace=True)
    return stats
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'r'``: Repeated measures correlation coefficient
        * ``'dof'``: Degrees of freedom
        * ``'pval'``: one or two tailed p-value
        * ``'CI95%'``: 95% parametric confidence intervals
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Raises
    ------
    AssertionError
        If ``data`` is not a DataFrame, if a column is missing, or if ``x``
        or ``y`` is not numeric.
    ValueError
        If fewer than 3 unique subjects remain after removal of missing
        values.

    See also
    --------
    plot_rm_corr

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From `Bakdash and Marusich (2017)
    <https://doi.org/10.3389/fpsyg.2017.00456>`_:

        *Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by − 1 to 1 and represents the strength of the linear
        association between two variables.*

    Results have been tested against the
    `rmcorr <https://github.com/cran/rmcorr>`_ R package.

    Please note that missing values are automatically removed from the
    dataframe (listwise deletion).

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                   r  dof      pval           CI95%     power
    rm_corr -0.50677   38  0.000847  [-0.71, -0.23]  0.929579

    Now plot using the :py:func:`pingouin.plot_rm_corr` function:

    .. plot::

        >>> import pingouin as pg
        >>> df = pg.read_dataset('rm_corr')
        >>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
    """
    from pingouin import ancova, power_corr

    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Count the subjects only once incomplete rows are gone: a subject whose
    # rows were all dropped must not count towards the minimum.
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Using PINGOUIN
    # For max precision, make sure rounding is disabled
    old_options = options.copy()
    options['round'] = None
    try:
        aov = ancova(dv=y, covar=x, between=subject, data=data)
    finally:
        # Restore the global options even if the ANCOVA raises, so a failure
        # here cannot leave rounding disabled for the rest of the session.
        options.update(old_options)

    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    # Row 1 of the ANCOVA table is read as the covariate, row 2 as the error
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    # The coefficient takes its sign from the common within-subject slope
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)

    # Convert to Dataframe
    stats = pd.DataFrame({"r": rm, "dof": int(dof), "pval": pval,
                          "CI95%": [ci], "power": pwr}, index=["rm_corr"])
    return _postprocess_dataframe(stats)
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, alternative='two-sided', method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    alternative : string
        Defines the alternative hypothesis, or tail of the partial
        correlation. Must be one of "two-sided" (default), "greater" or
        "less". Both "greater" and "less" return a one-sided p-value.
        "greater" tests against the alternative hypothesis that the partial
        correlation is positive (greater than zero), "less" tests against the
        hypothesis that the partial correlation is negative.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'r'``: Partial correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value

    Raises
    ------
    ValueError
        If ``covar`` is combined with ``x_covar`` / ``y_covar``, or if both
        ``x_covar`` and ``y_covar`` are given.

    See also
    --------
    corr, pcorr, pairwise_corr, rm_corr

    Notes
    -----
    Partial correlation [1]_ measures the degree of association between ``x``
    and ``y``, after removing the effect of one or more controlling variables
    (``covar``, or :math:`Z`). Practically, this is achieved by calculating
    the correlation coefficient between the residuals of two linear
    regressions:

    .. math:: x \\sim Z, y \\sim Z

    Like the correlation coefficient, the partial correlation
    coefficient takes on a value in the range from –1 to 1, where 1 indicates
    a perfect positive association.

    The semipartial correlation is similar to the partial correlation,
    with the exception that the set of controlling variables is only
    removed for either ``x`` or ``y``, but not both.

    Pingouin uses the method described in [2]_ to calculate the (semi)partial
    correlation coefficients and associated p-values. This method is based on
    the inverse covariance matrix and is significantly faster than the
    traditional regression-based method. Results have been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    .. important:: Rows with missing values are automatically removed from
        data.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [0.18, 0.75]  0.005

    3. Same but one-sided test

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="greater", method='spearman').round(3)
               n      r        CI95%  p-val
    spearman  30  0.521  [0.24, 1.0]  0.003

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="less", method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [-1.0, 0.72]  0.997

    4. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'], method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.578  [0.27, 0.78]  0.001

    5. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    6. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list

    # ---- Argument validation ------------------------------------------------
    assert alternative in ('two-sided', 'greater', 'less'), (
        "Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
    )
    assert method in ('pearson', 'spearman'), (
        'only "pearson" and "spearman" are supported for partial correlation.')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'

    is_partial = covar is not None
    has_x_covar = x_covar is not None
    has_y_covar = y_covar is not None
    if is_partial and (has_x_covar or has_y_covar):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if has_x_covar and has_y_covar:
        raise ValueError('Cannot specify both x_covar and y_covar.')

    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'

    # Every column involved must exist in ``data`` and be numeric. x and y
    # come first, so they sit at positions 0 and 1 of the matrices below.
    columns = _flatten_list([x, y, covar, x_covar, y_covar])
    assert all([name in data for name in columns]), \
        'columns are not in dataframe.'
    assert all([data[name].dtype.kind in 'bfiu' for name in columns])

    # ---- Listwise deletion of missing values --------------------------------
    data = data[columns].dropna()
    n_samples = data.shape[0]
    n_covar = data.shape[1] - 2  # everything except x and y is a covariate
    assert n_samples > 2, 'Data must have at least 3 non-NAN samples.'

    # ---- Partial correlation matrix (similar to pingouin.pcorr()) -----------
    # For Spearman, the covariance is taken on the ranks, similar to R cov().
    source = data.rank(na_option='keep') if method == "spearman" else data
    cov = source.cov()
    inv_cov = np.linalg.pinv(cov, hermitian=True)  # Inverse covariance matrix
    inv_diag = inv_cov.diagonal()
    scaling = np.diag(np.sqrt(1 / inv_diag))
    pcor = -1 * (scaling @ inv_cov @ scaling)

    if is_partial:
        r = pcor[0, 1]
    else:
        # Semi-partial correlation matrix
        with np.errstate(divide='ignore'):
            row_sd = np.sqrt(np.diag(cov))[..., None]
            resid_term = np.sqrt(
                np.abs(inv_diag - inv_cov ** 2 / inv_diag[..., None])).T
            spcor = pcor / row_sd / resid_term
        # [0, 1]: y_covar is removed from y; [1, 0]: x_covar is removed from x
        r = spcor[0, 1] if has_y_covar else spcor[1, 0]

    if np.isnan(r):
        # Correlation failed. Return NaN. When would this happen?
        nan_row = {'n': n_samples, 'r': np.nan, 'CI95%': np.nan,
                   'p-val': np.nan}
        return pd.DataFrame(nan_row, index=[method])

    # ---- p-value (per ``alternative``) and confidence intervals -------------
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    n_eff = n_samples - n_covar
    pval = _correl_pvalue(r, n_samples, n_covar, alternative)
    ci = compute_esci(stat=r, nx=n_eff, ny=n_eff, eftype='r', decimals=6,
                      alternative=alternative)

    # ---- Assemble the one-row output ----------------------------------------
    stats = pd.DataFrame(
        {'n': n_samples, 'r': r, 'CI95%': [ci], 'p-val': pval},
        index=[method])
    available = stats.keys().tolist()
    col_order = [name for name in ['n', 'r', 'CI95%', 'p-val']
                 if name in available]
    return _postprocess_dataframe(stats)[col_order]
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, tail='two-sided', method='pearson', **kwargs):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    tail : string
        Specify whether to return `'one-sided'` or `'two-sided'` p-value.
        The former are simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level correlation functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value (one- or two-sided, see ``tail``)

    Raises
    ------
    ValueError
        If ``covar`` is combined with ``x_covar`` / ``y_covar``, if both
        ``x_covar`` and ``y_covar`` are given, or if ``method`` is not
        recognized.

    See also
    --------
    corr, pairwise_corr, rm_corr

    Notes
    -----
    From [1]_:

    *With partial correlation, we find the correlation between x
    and y holding C constant for both x and y. Sometimes, however, we want to
    hold C constant for just x or just y. In that case, we compute a
    semi-partial correlation. A partial correlation is computed between
    two residuals. A semi-partial correlation is computed between one
    residual and another raw (or unresidualized) variable.*

    Note that if you are not interested in calculating the p-values [2]_ but
    only the partial correlation matrix, a faster alternative is to use
    :py:func:`pingouin.pcorr` (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    References
    ----------
    .. [1] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    .. [2] https://online.stat.psu.edu/stat505/lesson/6/6.3

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.491  [0.14, 0.73]  0.009

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.568  [0.26, 0.77]  0.001

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list

    # ---- Argument validation ------------------------------------------------
    assert tail in ['two-sided', 'one-sided'], (
        'tail must be "two-sided" or "one-sided".')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string.'
    assert isinstance(y, (str, tuple)), 'y must be a string.'
    optional_cols = (str, list, type(None))
    assert isinstance(covar, optional_cols)
    assert isinstance(x_covar, optional_cols)
    assert isinstance(y_covar, optional_cols)

    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')

    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'

    # Flat list of every column involved, then normalise the covariate
    # arguments to lists so that ``C[...]`` below always yields a 2D frame.
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # ---- Listwise deletion of missing values --------------------------------
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    subset = data[col]
    C = (subset - subset.mean(axis=0)) / subset.std(axis=0)

    def _residuals(target, covariates):
        # Least-squares residuals of standardized ``target`` on ``covariates``.
        design = np.atleast_2d(C[covariates].to_numpy())
        values = C[target].to_numpy()
        beta = np.linalg.lstsq(design, values, rcond=None)[0]
        return values - design @ beta

    if covar is not None:
        # PARTIAL CORRELATION: covariates are removed from both variables
        res_x = _residuals(x, covar)
        res_y = _residuals(y, covar)
    else:
        # SEMI-PARTIAL CORRELATION: the side without covariates keeps its
        # raw (unstandardized) values as "fake" residuals.
        if x_covar is not None:
            res_x = _residuals(x, x_covar)
        else:
            res_x = data[x].to_numpy()
        if y_covar is not None:
            res_y = _residuals(y, y_covar)
        else:
            res_y = data[y].to_numpy()

    # ---- Correlation coefficient --------------------------------------------
    # The p-values returned here are discarded because they do not account
    # for the number of covariates in the degrees of freedom.
    outliers = None  # only set by the outlier-detecting robust methods
    if method == 'pearson':
        r, _ = pearsonr(res_x, res_y)
    elif method == 'spearman':
        r, _ = spearmanr(res_x, res_y, **kwargs)
    elif method == 'kendall':
        r, _ = kendalltau(res_x, res_y, **kwargs)
    elif method == 'bicor':
        r, _ = bicor(res_x, res_y, **kwargs)
    elif method == 'percbend':
        r, _ = percbend(res_x, res_y, **kwargs)
    elif method == 'shepherd':
        r, _, outliers = shepherd(res_x, res_y, **kwargs)
    elif method == 'skipped':
        r, _, outliers = skipped(res_x, res_y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoid sudden stop in pingouin.pairwise_corr.
        nan_row = {'n': n, 'r': np.nan, 'CI95%': np.nan, 'p-val': np.nan}
        return pd.DataFrame(nan_row, index=[method])

    # Sample size after outlier removal
    n_outliers = 0 if outliers is None else sum(outliers)
    n_clean = n - n_outliers

    # ---- Two-sided p-value and confidence intervals -------------------------
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n_clean, k)
    n_eff = n_clean - k
    ci = compute_esci(stat=r, nx=n_eff, ny=n_eff, eftype='r', decimals=6)

    # ---- Assemble the one-row output ----------------------------------------
    if tail != 'two-sided':
        pval = .5 * pval
    stats = {'n': n, 'r': r, 'CI95%': [ci], 'p-val': pval}
    if method in ['shepherd', 'skipped']:
        stats['outliers'] = n_outliers
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Keep only the columns that exist, in the documented order
    available = stats.keys().tolist()
    col_order = [name for name in ['n', 'outliers', 'r', 'CI95%', 'p-val']
                 if name in available]
    return _postprocess_dataframe(stats)[col_order]