def test_corr(self):
    """Check ``corr`` across its methods against known reference values."""
    np.random.seed(123)
    mu = [4, 6]
    sigma = [(1, .6), (.6, 1)]
    sample = np.random.multivariate_normal(mu, sigma, 30)
    x, y = sample.T
    # Inject one extreme value in each variable
    x[3] = 12
    y[5] = -8

    # Smoke calls: these only need to run without raising
    smoke_calls = (
        ('pearson', {'tail': 'one-sided'}),
        ('spearman', {'tail': 'two-sided'}),
        ('kendall', {}),
        ('shepherd', {'tail': 'two-sided'}),
    )
    for method, extra in smoke_calls:
        corr(x, y, method=method, **extra)

    # Skipped correlation, reference values from the robust corr toolbox
    res = corr(x, y, method='skipped')
    assert np.round(res['r'].to_numpy(), 3) == 0.512
    assert res['outliers'].to_numpy() == 2

    # The underlying estimator can be switched through kwargs
    sk_sp = corr(x, y, method='skipped', corr_type='spearman')
    sk_pe = corr(x, y, method='skipped', corr_type='pearson')
    assert not sk_sp.equals(sk_pe)

    # Shepherd's pi
    res = corr(x, y, method='shepherd')
    assert res['outliers'].to_numpy() == 2
    _, _, outliers = skipped(x, y, corr_type='pearson')
    assert outliers.size == x.size
    assert res['n'].to_numpy() == 30

    # Percentage bend
    res = corr(x, y, method='percbend')
    assert np.round(res['r'].to_numpy(), 3) == 0.484

    # Biweight midcorrelation, reference value from astropy
    res = corr(x, y, method='bicor')
    assert np.isclose(res['r'].to_numpy(), 0.4951417784979)
    # ... and with a custom value of C passed through kwargs
    res = corr(x, y, method='bicor', c=5)
    assert np.isclose(res['r'].to_numpy(), 0.4940706950017)

    # One variable drawn from a uniform (non-normal) distribution
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')

    # Missing value in x
    x[3] = np.nan
    corr(x, y)

    # NOTE: the check that corr(x, x) reports an infinite BF10 is disabled
    # because of a failure on AppVeyor.

    # Unknown method name
    with pytest.raises(ValueError):
        corr(x, y, method='error')

    # Bayes factor, reference value from JASP
    df = read_dataset('pairwise_corr')
    res = corr(df['Neuroticism'], df['Extraversion'])
    bf10 = float(res['BF10'].to_numpy())
    assert np.isclose(1 / bf10, 1.478e-13)

    # When one column is a constant, the correlation is not defined and
    # Pingouin returns a DataFrame full of NaN, except for ``n``
    constant, ramp = [1, 1, 1], [1, 2, 3]
    res = corr(constant, ramp)
    assert res.at['pearson', 'n']
    assert np.isnan(res.at['pearson', 'r'])

    # Biweight midcorrelation returns NaN when MAD is not defined
    r_bicor = bicor(np.array([1, 1, 1, 1, 0, 1]), np.arange(6))[0]
    assert np.isnan(r_bicor)
def test_corr(self):
    """Check ``corr`` across its methods against known reference values."""
    np.random.seed(123)
    mu = [4, 6]
    sigma = [(1, .6), (.6, 1)]
    sample = np.random.multivariate_normal(mu, sigma, 30)
    x, y = sample.T
    # Inject one extreme value in each variable
    x[3] = 12
    y[5] = -8

    # Smoke calls: these only need to run without raising
    smoke_calls = (
        ('pearson', {'tail': 'one-sided'}),
        ('spearman', {'tail': 'two-sided'}),
        ('kendall', {}),
        ('shepherd', {'tail': 'two-sided'}),
    )
    for method, extra in smoke_calls:
        corr(x, y, method=method, **extra)

    # Skipped correlation, reference values from the robust corr toolbox
    res = corr(x, y, method='skipped')
    assert res['r'].values == 0.512
    assert res['outliers'].values == 2

    # Shepherd's pi
    res = corr(x, y, method='shepherd')
    assert res['outliers'].values == 2
    _, _, outliers = skipped(x, y, method='pearson')
    assert outliers.size == x.size
    assert res['n'].values == 30

    # Percentage bend
    res = corr(x, y, method='percbend')
    assert res['r'].values == 0.484

    # One variable drawn from a uniform (non-normal) distribution
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')

    # Missing value in x
    x[3] = np.nan
    corr(x, y)

    # An array correlated with itself must give an infinite Bayes factor
    self_corr = corr(x, x)
    assert float(self_corr.loc['pearson', 'BF10']) == np.inf

    # Invalid inputs: unknown method, then arrays of unequal length
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    with pytest.raises(ValueError):
        corr(x, y[:-10])

    # Bayes factor, reference value from JASP
    df = read_dataset('pairwise_corr')
    res = corr(df['Neuroticism'], df['Extraversion'])
    bf10 = float(res['BF10'].values)
    assert np.isclose(1 / bf10, 1.478e-13)

    # With a large sample (1500 values) the output has no BF10 column
    big_sample = np.random.multivariate_normal(mu, sigma, 1500)
    xx, yy = big_sample.T
    c1500 = corr(xx, yy)
    assert 'BF10' not in c1500.keys()
def plot_skipped_corr(x, y, xlabel=None, ylabel=None, n_boot=2000, seed=None):
    """Plot the bootstrapped 95% confidence intervals and distribution
    of a robust Skipped correlation.

    Parameters
    ----------
    x, y : 1D-arrays or list
        Samples
    xlabel, ylabel : str
        Axes labels
    n_boot : int
        Number of bootstrap iterations for the computation of the
        confidence intervals
    seed : int
        Random seed generator for the bootstrap confidence intervals.

    Returns
    -------
    fig : matplotlib Figure instance
        Matplotlib Figure. To get the individual axes, use fig.axes.

    Notes
    -----
    This function is inspired by the Matlab Robust Correlation Toolbox (Pernet,
    Wilcox and Rousselet, 2012). It uses the skipped correlation to determine
    the outliers. Note that this function requires the scikit-learn package.

    References
    ----------
    .. [1] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
           analyses: false positive and power validation using a new open
           source matlab toolbox. Front. Psychol. 3, 606.
           https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    Plot a robust Skipped correlation with bootstrapped confidence intervals

    .. plot::

        >>> import numpy as np
        >>> import pingouin as pg
        >>> np.random.seed(123)
        >>> mean, cov, n = [170, 70], [[20, 10], [10, 20]], 30
        >>> x, y = np.random.multivariate_normal(mean, cov, n).T
        >>> # Introduce two outliers
        >>> x[10], y[10] = 160, 100
        >>> x[8], y[8] = 165, 90
        >>> fig = pg.plot_skipped_corr(x, y, xlabel='Height', ylabel='Weight')
    """
    from pingouin.correlation import skipped
    from scipy.stats import pearsonr
    from pingouin.effsize import compute_bootci

    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.size == y.size, 'x and y must have the same number of elements.'

    # Skipped Spearman correlation; `outliers` is used below as a boolean
    # mask over the samples.
    # NOTE(review): the keyword naming the correlation type must match the
    # installed signature of `skipped` (`method` here) -- verify on upgrade.
    r, _, outliers = skipped(x, y, method='spearman')

    # Split the data once instead of re-indexing at every use
    inliers = ~outliers
    x_in, y_in = x[inliers], y[inliers]
    x_out, y_out = x[outliers], y[outliers]

    # Pearson correlation on the outlier-free data
    r_pearson, _ = pearsonr(x_in, y_in)

    # Bootstrapped skipped Spearman distribution & CI
    spearman_ci, spearman_dist = compute_bootci(
        x=x_in, y=y_in, func='spearman', n_boot=n_boot, return_dist=True,
        seed=seed)

    # Bootstrapped skipped Pearson distribution & CI
    pearson_ci, pearson_dist = compute_bootci(
        x=x_in, y=y_in, func='pearson', n_boot=n_boot, return_dist=True,
        seed=seed)

    # START THE PLOT
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4.2))
    sns.despine()

    # Scatter plot and regression lines.
    # x and y are passed by keyword: recent seaborn releases no longer accept
    # them positionally, while the keyword form works on every version.
    sns.regplot(x=x_in, y=y_in, ax=ax1, color='darkcyan')
    ax1.scatter(x_out, y_out, color='indianred', label='outliers')
    ax1.scatter(x_in, y_in, color='seagreen', label='good')

    # Labels
    xlabel = 'x' if xlabel is None else xlabel
    ylabel = 'y' if ylabel is None else ylabel
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_title('Outliers (n={})'.format(outliers.sum()), y=1.05)

    # Spearman distribution
    # NOTE(review): seaborn has deprecated `distplot` in favour of
    # `histplot` / `displot`; it is kept here so that the function still runs
    # on the older seaborn versions this code may target.
    sns.distplot(spearman_dist, kde=True, ax=ax2, color='darkcyan')
    for bound in spearman_ci:
        ax2.axvline(x=bound, color='coral', lw=2)
    ax2.axvline(x=0, color='k', ls='--', lw=1.5)
    ax2.set_ylabel('Density of bootstrap samples')
    ax2.set_xlabel('Correlation coefficient')
    ax2.set_title('Skipped Spearman r = {}\n95% CI = [{}, {}]'.format(
        np.round(r, 2), spearman_ci[0], spearman_ci[1]), y=1.05)

    # Pearson distribution
    sns.distplot(pearson_dist, kde=True, ax=ax3, color='steelblue')
    for bound in pearson_ci:
        ax3.axvline(x=bound, color='coral', lw=2)
    ax3.axvline(x=0, color='k', ls='--', lw=1.5)
    ax3.set_xlabel('Correlation coefficient')
    ax3.set_title('Skipped Pearson r = {}\n95% CI = [{}, {}]'.format(
        np.round(r_pearson, 2), pearson_ci[0], pearson_ci[1]), y=1.05)

    # Optimize layout
    plt.tight_layout()

    return fig
def test_corr(self):
    """Check ``corr`` against reference values for every method.

    Compare to R `correlation` package. See test_correlation.R file.
    """
    np.random.seed(123)
    mu = [4, 6]
    sigma = [(1, .6), (.6, 1)]
    sample = np.random.multivariate_normal(mu, sigma, 30)
    x, y = sample.T
    x2 = x.copy()
    y2 = y.copy()
    # Same positions are altered in both pairs, with different values
    x[3] = 12
    y[5] = -8
    x2[3] = 7
    y2[5] = 2.6

    # Pearson correlation, two-sided
    res = corr(x, y, method='pearson')
    ci = res.loc['pearson', 'CI95%']
    assert np.isclose(res.loc['pearson', 'r'], 0.1761221)
    assert np.isclose(res.loc['pearson', 'p-val'], 0.3518659)
    assert ci[0] == round(-0.1966232, 2)
    assert ci[1] == round(0.5043872, 2)

    # Pearson correlation, one-sided: greater
    res = corr(x, y, method='pearson', alternative='greater')
    ci = res.loc['pearson', 'CI95%']
    assert np.isclose(res.loc['pearson', 'r'], 0.1761221)
    assert np.isclose(res.loc['pearson', 'p-val'], 0.175933)
    assert ci[0] == round(-0.1376942, 2)
    assert ci[1] == 1

    # Pearson correlation, one-sided: less
    res = corr(x, y, method='pearson', alternative='less')
    ci = res.loc['pearson', 'CI95%']
    assert np.isclose(res.loc['pearson', 'r'], 0.1761221)
    assert np.isclose(res.loc['pearson', 'p-val'], 0.824067)
    assert ci[0] == -1
    assert ci[1] == round(0.4578044, 2)

    # Spearman correlation.
    # The CI is not asserted: R computes it with a different formula for
    # Spearman (its reference bounds are 0.1262988 and 0.7180799).
    res = corr(x, y, method='spearman')
    assert np.isclose(res.loc['spearman', 'r'], 0.4740823)
    assert np.isclose(res.loc['spearman', 'p-val'], 0.008129768)

    # Kendall correlation.
    # Only r is asserted: R uses a different estimation method than scipy
    # for the p-value.
    res = corr(x, y, method='kendall')
    assert np.isclose(res.loc['kendall', 'r'], 0.3517241)

    # Skipped correlation -- compare with robust corr toolbox
    # https://sourceforge.net/projects/robustcorrtool/
    res = corr(x, y, method='skipped')
    assert round(res.loc['skipped', 'r'], 4) == 0.5123
    assert res.loc['skipped', 'outliers'] == 2
    sk_sp = corr(x2, y2, method='skipped')
    assert round(sk_sp.loc['skipped', 'r'], 4) == 0.5123
    assert sk_sp.loc['skipped', 'outliers'] == 2

    # Skipped correlation based on Pearson instead of the default
    sk_pe = corr(x2, y2, method='skipped', corr_type='pearson')
    assert np.round(sk_pe.loc['skipped', 'r'], 4) == 0.5254
    assert sk_pe.loc['skipped', 'outliers'] == 2
    assert not sk_sp.equals(sk_pe)

    # Shepherd's pi
    res = corr(x, y, method='shepherd')
    assert np.isclose(res.loc['shepherd', 'r'], 0.5123153)
    assert np.isclose(res.loc['shepherd', 'p-val'], 0.005316)
    assert res.loc['shepherd', 'outliers'] == 2
    _, _, outliers = skipped(x, y, corr_type='pearson')
    assert outliers.size == x.size
    assert res.loc['shepherd', 'n'] == 30

    # Percentage bend -- compare with robust corr toolbox
    res = corr(x, y, method='percbend')
    assert round(res.loc['percbend', 'r'], 4) == 0.4843
    assert np.isclose(res.loc['percbend', 'r'], 0.4842686)
    assert np.isclose(res.loc['percbend', 'p-val'], 0.006693313)
    res = corr(x2, y2, method='percbend')
    assert round(res.loc['percbend', 'r'], 4) == 0.4843
    res = corr(x, y, method='percbend', beta=.5)
    assert round(res.loc['percbend', 'r'], 4) == 0.4848

    # Biweight midcorrelation, reference values from astropy
    res = corr(x, y, method='bicor')
    ci = res.loc['bicor', 'CI95%']
    assert np.isclose(res.loc['bicor', 'r'], 0.4951418)
    assert np.isclose(res.loc['bicor', 'p-val'], 0.005403701)
    assert ci[0] == round(0.1641553, 2)
    assert ci[1] == round(0.7259185, 2)
    res = corr(x, y, method='bicor', c=5)
    assert np.isclose(res.loc['bicor', 'r'], 0.4940706950017)

    # One variable drawn from a uniform (non-normal) distribution
    z = np.random.uniform(size=30)
    corr(x, z, method='pearson')

    # Missing value in x
    x[3] = np.nan
    corr(x, y)

    # NOTE: the check that corr(x, x) reports an infinite BF10 is disabled
    # because of a failure on AppVeyor.

    # Invalid arguments: unknown method, then the ``tail`` keyword
    with pytest.raises(ValueError):
        corr(x, y, method='error')
    with pytest.raises(ValueError):
        corr(x, y, tail='error')

    # Bayes factor, reference value from JASP
    df = read_dataset('pairwise_corr')
    res = corr(df['Neuroticism'], df['Extraversion'])
    bf10 = float(res['BF10'].to_numpy())
    assert np.isclose(1 / bf10, 1.478e-13)

    # Perfect correlation, CI and power should be 1, BF should be Inf
    # https://github.com/raphaelvallat/pingouin/issues/195
    res = corr(x, x)
    assert np.isclose(res.at['pearson', 'r'], 1)
    assert np.isclose(res.at['pearson', 'power'], 1)

    # When one column is a constant, the correlation is not defined and
    # Pingouin returns a DataFrame full of NaN, except for ``n``
    constant, ramp = [1, 1, 1], [1, 2, 3]
    res = corr(constant, ramp)
    assert res.at['pearson', 'n']
    assert np.isnan(res.at['pearson', 'r'])

    # Biweight midcorrelation returns NaN when MAD is not defined
    r_bicor = bicor(np.array([1, 1, 1, 1, 0, 1]), np.arange(6))[0]
    assert np.isnan(r_bicor)