def test_compute_boot_esci(self):
    """Test function compute_bootci.

    Compare with Matlab bootci function.
    """
    # This is the `lawdata` dataset in Matlab
    # >>> load lawdata
    # >>> x_m = gpa;
    # >>> y_m = lsat;
    x_m = [3.39, 3.3, 2.81, 3.03, 3.44, 3.07, 3.0, 3.43, 3.36, 3.13, 3.12,
           2.74, 2.76, 2.88, 2.96]
    y_m = [576, 635, 558, 578, 666, 580, 555, 661, 651, 605, 653, 575, 545,
           572, 594]

    # 1. Bootstrapped CI around a Pearson correlation coefficient
    # Matlab: bootci(n_boot, {@corr, x_m, y_m}, 'type', 'norm');
    ci = compute_bootci(x_m, y_m, method='norm', seed=123)
    assert ci[0] == 0.52 and ci[1] == 1.05
    ci = compute_bootci(x_m, y_m, method='per', seed=123)
    assert ci[0] == 0.45 and ci[1] == 0.96
    ci = compute_bootci(x_m, y_m, method='cper', seed=123)
    assert ci[0] == 0.39 and ci[1] == 0.95

    # 2. Univariate function: mean
    ci_n = compute_bootci(x_m, func='mean', method='norm', seed=42)
    ci_p = compute_bootci(x_m, func='mean', method='per', seed=42)
    ci_c = compute_bootci(x_m, func='mean', method='cper', seed=42)
    assert ci_n[0] == 2.98 and ci_n[1] == 3.21
    assert ci_p[0] == 2.98 and ci_p[1] == 3.21
    assert ci_c[0] == 2.98 and round(ci_c[1], 1) == 3.2

    # 3. Univariate custom function: skewness
    from scipy.stats import skew
    n_boot = 10000
    ci_n = compute_bootci(x_m, func=skew, method='norm', n_boot=n_boot,
                          decimals=1, seed=42)
    ci_p = compute_bootci(x_m, func=skew, method='per', n_boot=n_boot,
                          decimals=1, seed=42)
    ci_c = compute_bootci(x_m, func=skew, method='cper', n_boot=n_boot,
                          decimals=1, seed=42)
    assert ci_n[0] == -0.7 and ci_n[1] == 0.8
    assert ci_p[0] == -0.7 and ci_p[1] == 0.8
    assert ci_c[0] == -0.7 and ci_c[1] == 0.8

    # 4. Bivariate custom function: paired T-test
    from scipy.stats import ttest_rel
    ci_n = compute_bootci(x_m, y_m, func=lambda x, y: ttest_rel(x, y)[0],
                          method='norm', n_boot=n_boot, decimals=0, seed=42)
    ci_p = compute_bootci(x_m, y_m, func=lambda x, y: ttest_rel(x, y)[0],
                          method='per', n_boot=n_boot, decimals=0, seed=42)
    ci_c = compute_bootci(x_m, y_m, func=lambda x, y: ttest_rel(x, y)[0],
                          method='cper', n_boot=n_boot, decimals=0, seed=42)
    assert ci_n[0] == -69 and ci_n[1] == -35
    assert ci_p[0] == -79 and ci_p[1] == -48
    assert ci_c[0] == -68 and ci_c[1] == -47

    # 5. Test all combinations of methods and functions
    # (x and y are the random samples defined at the module level
    # of this test file)
    from itertools import product
    methods = ['norm', 'per', 'cper']
    funcs = ['spearman', 'pearson', 'cohen', 'hedges']
    paired = [True, False]
    pr = list(product(methods, funcs, paired))
    for m, f, p in pr:
        compute_bootci(x, y, func=f, method=m, seed=123, n_boot=100)

    # Now the univariate functions
    funcs = ['mean', 'std', 'var']
    for m, f in list(product(methods, funcs)):
        compute_bootci(x, func=f, method=m, seed=123, n_boot=100)

    with pytest.raises(ValueError):
        compute_bootci(x, y, func='wrong')

    # Using a custom function
    compute_bootci(x, y, func=lambda x, y: np.sum(np.exp(x) / np.exp(y)),
                   n_boot=10000, decimals=4, confidence=.68, seed=None)

    # Get the bootstrapped distribution
    _, bdist = compute_bootci(x, y, return_dist=True, n_boot=1500)
    assert bdist.size == 1500
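# For readers unfamiliar with the three CI methods exercised above, here is a
# minimal, self-contained sketch of what 'norm', 'per' and 'cper' typically
# mean for a bootstrap confidence interval. This is an illustrative
# reimplementation under the standard textbook definitions, NOT pingouin's
# actual code: `_bootci_sketch` is a hypothetical helper name, and edge cases
# (e.g. a degenerate bootstrap distribution in the bias correction) are
# deliberately ignored.
import numpy as np
from scipy.stats import norm


def _bootci_sketch(x, func=np.mean, n_boot=2000, confidence=0.95,
                   method='per', seed=42):
    """Illustrative bootstrap CI (hypothetical helper, not pingouin's API)."""
    rng = np.random.RandomState(seed)
    x = np.asarray(x)
    theta = func(x)
    # Resample with replacement and evaluate the statistic on each resample
    boot = np.array([func(x[rng.randint(0, x.size, x.size)])
                     for _ in range(n_boot)])
    alpha = 1 - confidence
    if method == 'norm':
        # Normal approximation: point estimate +/- z * bootstrap SE
        z = norm.ppf(1 - alpha / 2)
        se = boot.std(ddof=1)
        return theta - z * se, theta + z * se
    elif method == 'per':
        # Plain percentile interval of the bootstrap distribution
        return tuple(np.percentile(boot, [100 * alpha / 2,
                                          100 * (1 - alpha / 2)]))
    else:  # 'cper': bias-corrected percentile interval
        z0 = norm.ppf(np.mean(boot < theta))
        lo = norm.cdf(2 * z0 + norm.ppf(alpha / 2))
        hi = norm.cdf(2 * z0 + norm.ppf(1 - alpha / 2))
        return tuple(np.percentile(boot, [100 * lo, 100 * hi]))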
def plot_skipped_corr(x, y, xlabel=None, ylabel=None, n_boot=2000,
                      seed=None):
    """Plot the bootstrapped 95% confidence intervals and distribution
    of a robust Skipped correlation.

    Parameters
    ----------
    x, y : 1D-arrays or list
        Samples
    xlabel, ylabel : str
        Axes labels
    n_boot : int
        Number of bootstrap iterations for the computation of the
        confidence intervals
    seed : int
        Random seed generator for the bootstrap confidence intervals.

    Returns
    -------
    fig : matplotlib Figure instance
        Matplotlib Figure. To get the individual axes, use fig.axes.

    Notes
    -----
    This function is inspired by the Matlab Robust Correlation Toolbox
    (Pernet, Wilcox and Rousselet, 2012). It uses the skipped correlation
    to determine the outliers. Note that this function requires the
    scikit-learn package.

    References
    ----------
    .. [1] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust
       correlation analyses: false positive and power validation using a
       new open source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    Plot a robust Skipped correlation with bootstrapped confidence intervals

    .. plot::

        >>> import numpy as np
        >>> import pingouin as pg
        >>> np.random.seed(123)
        >>> mean, cov, n = [170, 70], [[20, 10], [10, 20]], 30
        >>> x, y = np.random.multivariate_normal(mean, cov, n).T
        >>> # Introduce two outliers
        >>> x[10], y[10] = 160, 100
        >>> x[8], y[8] = 165, 90
        >>> fig = pg.plot_skipped_corr(x, y, xlabel='Height', ylabel='Weight')
    """
    from pingouin.correlation import skipped
    from scipy.stats import pearsonr
    from pingouin.effsize import compute_bootci

    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.size == y.size

    # Skipped Spearman / Pearson correlations
    r, p, outliers = skipped(x, y, method='spearman')
    r_pearson, _ = pearsonr(x[~outliers], y[~outliers])

    # Bootstrapped skipped Spearman distribution & CI
    spearman_ci, spearman_dist = compute_bootci(
        x=x[~outliers], y=y[~outliers], func='spearman',
        n_boot=n_boot, return_dist=True, seed=seed)

    # Bootstrapped skipped Pearson distribution & CI
    pearson_ci, pearson_dist = compute_bootci(
        x=x[~outliers], y=y[~outliers], func='pearson',
        n_boot=n_boot, return_dist=True, seed=seed)

    # START THE PLOT
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4.2))
    # plt.subplots_adjust(wspace=0.3)
    sns.despine()

    # Scatter plot and regression lines
    sns.regplot(x[~outliers], y[~outliers], ax=ax1, color='darkcyan')
    ax1.scatter(x[outliers], y[outliers], color='indianred', label='outliers')
    ax1.scatter(x[~outliers], y[~outliers], color='seagreen', label='good')

    # Labels
    xlabel = 'x' if xlabel is None else xlabel
    ylabel = 'y' if ylabel is None else ylabel
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_title('Outliers (n={})'.format(sum(outliers)), y=1.05)

    # Spearman distribution
    sns.distplot(spearman_dist, kde=True, ax=ax2, color='darkcyan')
    for i in spearman_ci:
        ax2.axvline(x=i, color='coral', lw=2)
    ax2.axvline(x=0, color='k', ls='--', lw=1.5)
    ax2.set_ylabel('Density of bootstrap samples')
    ax2.set_xlabel('Correlation coefficient')
    ax2.set_title('Skipped Spearman r = {}\n95% CI = [{}, {}]'.format(
        r.round(2), spearman_ci[0], spearman_ci[1]), y=1.05)

    # Pearson distribution
    sns.distplot(pearson_dist, kde=True, ax=ax3, color='steelblue')
    for i in pearson_ci:
        ax3.axvline(x=i, color='coral', lw=2)
    ax3.axvline(x=0, color='k', ls='--', lw=1.5)
    ax3.set_xlabel('Correlation coefficient')
    ax3.set_title('Skipped Pearson r = {}\n95% CI = [{}, {}]'.format(
        r_pearson.round(2), pearson_ci[0], pearson_ci[1]), y=1.05)

    # Optimize layout
    plt.tight_layout()

    return fig
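# For reference, a minimal sketch of how a skipped correlation can flag
# bivariate outliers, in the spirit of Pernet et al. (2012) cited above.
# This is an illustrative approximation, NOT pingouin's `skipped`
# implementation: `_skipped_sketch` is a hypothetical helper, and it simply
# marks as outliers the points whose squared robust (Mahalanobis) distance
# from the minimum covariance determinant (MCD) center exceeds a chi-square
# cutoff, then correlates the remaining points.
import numpy as np
from scipy.stats import chi2, spearmanr
from sklearn.covariance import MinCovDet


def _skipped_sketch(x, y, seed=42):
    """Illustrative skipped Spearman correlation (hypothetical helper)."""
    X = np.column_stack((x, y))
    # Robust location and scatter via the minimum covariance determinant
    mcd = MinCovDet(random_state=seed).fit(X)
    d2 = mcd.mahalanobis(X)  # squared robust distances
    outliers = d2 > chi2.ppf(0.975, df=2)
    # Correlate only the non-outlying points
    r, pval = spearmanr(x[~outliers], y[~outliers])
    return r, pval, outliers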