Example #1
0
 def test_corr(self):
     """Run ``corr`` with every supported method and pin reference values.

     Uses a seeded bivariate-normal sample of 30 points with one extreme
     value injected into each variable. The statements are order-dependent:
     the global NumPy RNG is seeded once and the first sample is later
     modified in place (a NaN is inserted) before the remaining calls.
     """
     np.random.seed(123)
     mu = [4, 6]
     sigma = [(1, .6), (.6, 1)]
     a, b = np.random.multivariate_normal(mu, sigma, 30).T
     a[3] = 12
     b[5] = -8

     # Smoke calls: results are not inspected, they only have to run.
     for options in (
         {'method': 'pearson', 'tail': 'one-sided'},
         {'method': 'spearman', 'tail': 'two-sided'},
         {'method': 'kendall'},
         {'method': 'shepherd', 'tail': 'two-sided'},
     ):
         corr(a, b, **options)

     # Skipped correlation -- compare with robust corr toolbox
     res = corr(a, b, method='skipped')
     assert np.round(res['r'].to_numpy(), 3) == 0.512
     assert res['outliers'].to_numpy() == 2

     # ``corr_type`` is passed through kwargs and must change the output
     skipped_spearman = corr(a, b, method='skipped', corr_type='spearman')
     skipped_pearson = corr(a, b, method='skipped', corr_type='pearson')
     assert not skipped_spearman.equals(skipped_pearson)

     # Shepherd method; the third value returned by ``skipped`` must have
     # one entry per observation
     res = corr(a, b, method='shepherd')
     assert res['outliers'].to_numpy() == 2
     _, _, flagged = skipped(a, b, corr_type='pearson')
     assert flagged.size == a.size
     assert res['n'].to_numpy() == 30

     # Percentage bend correlation
     res = corr(a, b, method='percbend')
     assert np.round(res['r'].to_numpy(), 3) == 0.484

     # Biweight correlation -- compare with astropy
     res = corr(a, b, method='bicor')
     assert np.isclose(res['r'].to_numpy(), 0.4951417784979)
     # Same method with the value of ``c`` overridden through kwargs
     res = corr(a, b, method='bicor', c=5)
     assert np.isclose(res['r'].to_numpy(), 0.4940706950017)

     # Second variable not normally distributed
     corr(a, np.random.uniform(size=30), method='pearson')

     # With a NaN value in the first sample
     a[3] = np.nan
     corr(a, b)

     # With the same array: check disabled because of an AppVeyor failure
     # assert corr(a, a).loc['pearson', 'BF10'] == str(np.inf)

     # An unknown method must raise
     with pytest.raises(ValueError):
         corr(a, b, method='error')

     # Compare BF10 with JASP
     data = read_dataset('pairwise_corr')
     res = corr(data['Neuroticism'], data['Extraversion'])
     bf10 = float(res['BF10'].to_numpy())
     assert np.isclose(1 / bf10, 1.478e-13)

     # When one column is a constant, the correlation is not defined
     # and Pingouin returns a DataFrame full of NaN, except for ``n``
     res = corr([1, 1, 1], [1, 2, 3])
     assert res.at['pearson', 'n']
     assert np.isnan(res.at['pearson', 'r'])

     # Biweight midcorrelation returns NaN when MAD is not defined
     degenerate = np.array([1, 1, 1, 1, 0, 1])
     assert np.isnan(bicor(degenerate, np.arange(6))[0])
Example #2
0
 def test_corr(self):
     """Run ``corr`` with several methods and pin reference values.

     Uses a seeded bivariate-normal sample of 30 points with one extreme
     value injected into each variable. The statements are order-dependent:
     the global NumPy RNG is seeded once and the first sample is later
     modified in place (a NaN is inserted) before the remaining calls.
     """
     np.random.seed(123)
     mean = [4, 6]
     cov = [(1, .6), (.6, 1)]
     a, b = np.random.multivariate_normal(mean, cov, 30).T
     a[3] = 12
     b[5] = -8

     # Smoke calls: results are not inspected, they only have to run.
     for options in (
         {'method': 'pearson', 'tail': 'one-sided'},
         {'method': 'spearman', 'tail': 'two-sided'},
         {'method': 'kendall'},
         {'method': 'shepherd', 'tail': 'two-sided'},
     ):
         corr(a, b, **options)

     # Skipped correlation -- compare with robust corr toolbox
     res = corr(a, b, method='skipped')
     assert res['r'].values == 0.512
     assert res['outliers'].values == 2

     # Shepherd method; the third value returned by ``skipped`` must have
     # one entry per observation
     res = corr(a, b, method='shepherd')
     assert res['outliers'].values == 2
     _, _, flagged = skipped(a, b, method='pearson')
     assert flagged.size == a.size
     assert res['n'].values == 30

     # Percentage bend correlation
     res = corr(a, b, method='percbend')
     assert res['r'].values == 0.484

     # Second variable not normally distributed
     corr(a, np.random.uniform(size=30), method='pearson')

     # With a NaN value in the first sample
     a[3] = np.nan
     corr(a, b)

     # With the same array the Bayes factor must be infinite
     same = corr(a, a)
     assert float(same.loc['pearson', 'BF10']) == np.inf

     # Wrong arguments: unknown method, then arrays of different length
     with pytest.raises(ValueError):
         corr(a, b, method='error')
     with pytest.raises(ValueError):
         corr(a, b[:-10])

     # Compare with JASP
     data = read_dataset('pairwise_corr')
     res = corr(data['Neuroticism'], data['Extraversion'])
     bf10 = float(res['BF10'].values)
     assert np.isclose(1 / bf10, 1.478e-13)

     # With 1500 observations the output must not contain a BF10 column
     big_a, big_b = np.random.multivariate_normal(mean, cov, 1500).T
     assert 'BF10' not in corr(big_a, big_b).keys()
Example #3
0
def plot_skipped_corr(x, y, xlabel=None, ylabel=None, n_boot=2000, seed=None):
    """Plot the bootstrapped 95% confidence intervals and distribution
    of a robust Skipped correlation.

    Parameters
    ----------
    x, y : 1D-arrays or list
        Samples. Must have the same number of elements.
    xlabel, ylabel : str
        Axes labels of the scatter plot. Default to ``'x'`` and ``'y'``
        when None.
    n_boot : int
        Number of bootstrap iterations for the computation of the
        confidence intervals
    seed : int
        Random seed generator for the bootstrap confidence intervals.
        The same seed is used for the Spearman and Pearson bootstraps.

    Returns
    -------
    fig : matplotlib Figure instance
        Matplotlib Figure with three axes (scatter plot with regression
        line, bootstrapped Spearman distribution, bootstrapped Pearson
        distribution). To get the individual axes, use fig.axes.

    Raises
    ------
    AssertionError
        If ``x`` and ``y`` do not have the same size.

    Notes
    -----
    This function is inspired by the Matlab Robust Correlation Toolbox (Pernet,
    Wilcox and Rousselet, 2012). It uses the skipped correlation to determine
    the outliers. Note that this function requires the scikit-learn package.

    References
    ----------
    .. [1] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
           analyses: false positive and power validation using a new open
           source matlab toolbox. Front. Psychol. 3, 606.
           https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------

    Plot a robust Skipped correlation with bootstrapped confidence intervals

    .. plot::

        >>> import numpy as np
        >>> import pingouin as pg
        >>> np.random.seed(123)
        >>> mean, cov, n = [170, 70], [[20, 10], [10, 20]], 30
        >>> x, y = np.random.multivariate_normal(mean, cov, n).T
        >>> # Introduce two outliers
        >>> x[10], y[10] = 160, 100
        >>> x[8], y[8] = 165, 90
        >>> fig = pg.plot_skipped_corr(x, y, xlabel='Height', ylabel='Weight')
    """
    from pingouin.correlation import skipped
    from scipy.stats import pearsonr
    from pingouin.effsize import compute_bootci

    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.size == y.size

    # Skipped Spearman / Pearson correlations. ``outliers`` is used below as
    # a boolean mask over the observations.
    r, p, outliers = skipped(x, y, method='spearman')
    r_pearson, _ = pearsonr(x[~outliers], y[~outliers])

    # Bootstrapped skipped Spearman distribution & CI
    spearman_ci, spearman_dist = compute_bootci(x=x[~outliers],
                                                y=y[~outliers],
                                                func='spearman',
                                                n_boot=n_boot,
                                                return_dist=True,
                                                seed=seed)

    # Bootstrapped skipped Pearson distribution & CI
    pearson_ci, pearson_dist = compute_bootci(x=x[~outliers],
                                              y=y[~outliers],
                                              func='pearson',
                                              n_boot=n_boot,
                                              return_dist=True,
                                              seed=seed)

    # START THE PLOT
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4.2))
    # plt.subplots_adjust(wspace=0.3)
    sns.despine()

    # Scatter plot and regression lines.
    # The data are passed by keyword: seaborn deprecated positional ``x`` /
    # ``y`` in v0.11 and only accepts ``data`` positionally from v0.12.
    sns.regplot(x=x[~outliers], y=y[~outliers], ax=ax1, color='darkcyan')
    ax1.scatter(x[outliers], y[outliers], color='indianred', label='outliers')
    ax1.scatter(x[~outliers], y[~outliers], color='seagreen', label='good')

    # Labels
    xlabel = 'x' if xlabel is None else xlabel
    ylabel = 'y' if ylabel is None else ylabel
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_title('Outliers (n={})'.format(sum(outliers)), y=1.05)

    # Spearman distribution
    # NOTE(review): ``sns.distplot`` is deprecated in recent seaborn
    # releases; consider ``sns.histplot(..., kde=True, stat='density')``
    # once the minimum supported seaborn version allows it.
    sns.distplot(spearman_dist, kde=True, ax=ax2, color='darkcyan')
    for i in spearman_ci:
        ax2.axvline(x=i, color='coral', lw=2)
    ax2.axvline(x=0, color='k', ls='--', lw=1.5)
    ax2.set_ylabel('Density of bootstrap samples')
    ax2.set_xlabel('Correlation coefficient')
    ax2.set_title('Skipped Spearman r = {}\n95% CI = [{}, {}]'.format(
        r.round(2), spearman_ci[0], spearman_ci[1]),
                  y=1.05)

    # Pearson distribution
    sns.distplot(pearson_dist, kde=True, ax=ax3, color='steelblue')
    for i in pearson_ci:
        ax3.axvline(x=i, color='coral', lw=2)
    ax3.axvline(x=0, color='k', ls='--', lw=1.5)
    ax3.set_xlabel('Correlation coefficient')
    ax3.set_title('Skipped Pearson r = {}\n95% CI = [{}, {}]'.format(
        r_pearson.round(2), pearson_ci[0], pearson_ci[1]),
                  y=1.05)

    # Optimize layout
    plt.tight_layout()

    return fig
Example #4
0
    def test_corr(self):
        """Check ``corr`` against reference values for every method.

        Compare to R `correlation` package (see test_correlation.R file);
        other reference values come from the robust correlation toolbox,
        astropy and JASP, as noted inline.

        The statements are order-dependent: the global NumPy RNG is seeded
        once and the first sample is later modified in place (a NaN is
        inserted) before the remaining calls.
        """
        np.random.seed(123)
        mu = [4, 6]
        sigma = [(1, .6), (.6, 1)]
        a, b = np.random.multivariate_normal(mu, sigma, 30).T
        a2 = a.copy()
        b2 = b.copy()
        # Two variants of the same sample, differing only in the values
        # injected at index 3 / index 5.
        a[3] = 12
        b[5] = -8
        a2[3] = 7
        b2[5] = 2.6

        # Pearson correlation (default alternative)
        res = corr(a, b, method='pearson')
        assert np.isclose(res.loc['pearson', 'r'], 0.1761221)
        assert np.isclose(res.loc['pearson', 'p-val'], 0.3518659)
        ci = res.loc['pearson', 'CI95%']
        assert ci[0] == round(-0.1966232, 2)
        assert ci[1] == round(0.5043872, 2)
        # - One-sided: greater
        res = corr(a, b, method='pearson', alternative='greater')
        assert np.isclose(res.loc['pearson', 'r'], 0.1761221)
        assert np.isclose(res.loc['pearson', 'p-val'], 0.175933)
        ci = res.loc['pearson', 'CI95%']
        assert ci[0] == round(-0.1376942, 2)
        assert ci[1] == 1
        # - One-sided: less
        res = corr(a, b, method='pearson', alternative='less')
        assert np.isclose(res.loc['pearson', 'r'], 0.1761221)
        assert np.isclose(res.loc['pearson', 'p-val'], 0.824067)
        ci = res.loc['pearson', 'CI95%']
        assert ci[0] == -1
        assert ci[1] == round(0.4578044, 2)

        # Spearman correlation
        res = corr(a, b, method='spearman')
        assert np.isclose(res.loc['spearman', 'r'], 0.4740823)
        assert np.isclose(res.loc['spearman', 'p-val'], 0.008129768)
        # CI are calculated using a different formula for Spearman in R,
        # so the R bounds (0.1262988, 0.7180799) are not checked here.

        # Kendall correlation
        # R uses a different estimation method than scipy for the p-value,
        # so only the coefficient is checked.
        res = corr(a, b, method='kendall')
        assert np.isclose(res.loc['kendall', 'r'], 0.3517241)

        # Skipped correlation -- compare with robust corr toolbox
        # https://sourceforge.net/projects/robustcorrtool/
        res = corr(a, b, method='skipped')
        assert round(res.loc['skipped', 'r'], 4) == 0.5123
        assert res.loc['skipped', 'outliers'] == 2
        skipped_default = corr(a2, b2, method='skipped')
        assert round(skipped_default.loc['skipped', 'r'], 4) == 0.5123
        assert skipped_default.loc['skipped', 'outliers'] == 2
        # Pearson skipped correlation
        skipped_pearson = corr(a2, b2, method='skipped', corr_type='pearson')
        assert np.round(skipped_pearson.loc['skipped', 'r'], 4) == 0.5254
        assert skipped_pearson.loc['skipped', 'outliers'] == 2
        assert not skipped_default.equals(skipped_pearson)

        # Shepherd; the third value returned by ``skipped`` must have one
        # entry per observation
        res = corr(a, b, method='shepherd')
        assert np.isclose(res.loc['shepherd', 'r'], 0.5123153)
        assert np.isclose(res.loc['shepherd', 'p-val'], 0.005316)
        assert res.loc['shepherd', 'outliers'] == 2
        _, _, flagged = skipped(a, b, corr_type='pearson')
        assert flagged.size == a.size
        assert res.loc['shepherd', 'n'] == 30

        # Percbend -- compare with robust corr toolbox
        res = corr(a, b, method='percbend')
        assert round(res.loc['percbend', 'r'], 4) == 0.4843
        assert np.isclose(res.loc['percbend', 'r'], 0.4842686)
        assert np.isclose(res.loc['percbend', 'p-val'], 0.006693313)
        res = corr(a2, b2, method='percbend')
        assert round(res.loc['percbend', 'r'], 4) == 0.4843
        res = corr(a, b, method='percbend', beta=.5)
        assert round(res.loc['percbend', 'r'], 4) == 0.4848

        # Compare biweight correlation to astropy
        res = corr(a, b, method='bicor')
        assert np.isclose(res.loc['bicor', 'r'], 0.4951418)
        assert np.isclose(res.loc['bicor', 'p-val'], 0.005403701)
        ci = res.loc['bicor', 'CI95%']
        assert ci[0] == round(0.1641553, 2)
        assert ci[1] == round(0.7259185, 2)
        res = corr(a, b, method='bicor', c=5)
        assert np.isclose(res.loc['bicor', 'r'], 0.4940706950017)

        # Second variable not normally distributed
        corr(a, np.random.uniform(size=30), method='pearson')

        # With a NaN value in the first sample
        a[3] = np.nan
        corr(a, b)

        # With the same array: check disabled because of an AppVeyor failure
        # assert corr(a, a).loc['pearson', 'BF10'] == str(np.inf)

        # Wrong arguments: unknown method, then the ``tail`` keyword
        with pytest.raises(ValueError):
            corr(a, b, method='error')
        with pytest.raises(ValueError):
            corr(a, b, tail='error')

        # Compare BF10 with JASP
        data = read_dataset('pairwise_corr')
        res = corr(data['Neuroticism'], data['Extraversion'])
        bf10 = float(res['BF10'].to_numpy())
        assert np.isclose(1 / bf10, 1.478e-13)

        # Perfect correlation, CI and power should be 1, BF should be Inf
        # https://github.com/raphaelvallat/pingouin/issues/195
        res = corr(a, a)
        assert np.isclose(res.at['pearson', 'r'], 1)
        assert np.isclose(res.at['pearson', 'power'], 1)

        # When one column is a constant, the correlation is not defined
        # and Pingouin returns a DataFrame full of NaN, except for ``n``
        res = corr([1, 1, 1], [1, 2, 3])
        assert res.at['pearson', 'n']
        assert np.isnan(res.at['pearson', 'r'])

        # Biweight midcorrelation returns NaN when MAD is not defined
        degenerate = np.array([1, 1, 1, 1, 0, 1])
        assert np.isnan(bicor(degenerate, np.arange(6))[0])