Code example #1
File: test_utils.py  Project: agamemnonc/pingouin
 def test_flatten_list(self):
     """Test function _flatten_list."""
     x = ['X1', ['M1', 'M2'], 'Y1', ['Y2']]
     fl = _flatten_list(x)
     np.testing.assert_array_equal(fl, ['X1', 'M1', 'M2', 'Y1', 'Y2'])
     x = ['Xaa', 'Xbb', 'Xcc']
     fl = _flatten_list(x)
     np.testing.assert_array_equal(fl, x)
Code example #2
 def test_flatten_list(self):
     """Test function _flatten_list."""
     x = ['X1', ['M1', 'M2'], 'Y1', ['Y2']]
     fl = _flatten_list(x)
     np.testing.assert_array_equal(fl, ['X1', 'M1', 'M2', 'Y1', 'Y2'])
     x = ['Xaa', 'Xbb', 'Xcc']
     np.testing.assert_array_equal(_flatten_list(x), x)
     # With tuples
     xt = ['Xaa', ('Xbb', 'Xcc')]
     fl = _flatten_list(xt)
     assert fl == xt
     np.testing.assert_array_equal(_flatten_list(xt, include_tuple=True), x)
Code example #3
 def test_flatten_list(self):
     """Test function _flatten_list."""
     x = ['X1', ['M1', 'M2'], 'Y1', ['Y2']]
     fl = _flatten_list(x)
     np.testing.assert_array_equal(fl, ['X1', 'M1', 'M2', 'Y1', 'Y2'])
     x = ['Xaa', 'Xbb', 'Xcc']
     np.testing.assert_array_equal(_flatten_list(x), x)
     # With tuples
     xt = ['Xaa', ('Xbb', 'Xcc')]
     fl = _flatten_list(xt)
     assert fl == xt
     np.testing.assert_array_equal(_flatten_list(xt, include_tuple=True), x)
     assert _flatten_list(1) == 1  # x is not iterable
     assert _flatten_list([(1), (2)]) == [1, 2]  # (1) is an int, not a tuple
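
The three versions above exercise pingouin's private _flatten_list helper without showing its implementation. Below is a minimal sketch that satisfies every assertion in these tests (one level of flattening, optional tuple expansion, passthrough of non-iterable input); the actual helper in pingouin/utils.py may differ in detail:

import collections.abc

def _flatten_list_sketch(x, include_tuple=False):
    """Flatten nested lists (and optionally tuples) one level deep.

    Sketch only -- not the actual pingouin implementation.
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, str):
        return x  # non-iterable input (e.g. an int) is returned unchanged
    result = []
    for item in x:
        if isinstance(item, list) or (include_tuple and isinstance(item, tuple)):
            result.extend(item)  # splice the nested sequence in place
        else:
            result.append(item)
    # The partial_corr examples below call _flatten_list([x, y, covar, ...])
    # with possible None entries, which implies the real helper drops None
    return [r for r in result if r is not None]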
Code example #4
File: correlation.py  Project: AnnaTruzzi/pingouin
def mantel_partial_corr(data=None,
                        x=None,
                        y=None,
                        covar=None,
                        x_covar=None,
                        y_covar=None,
                        tail='two-sided',
                        method='pearson',
                        permutations=10000):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe. Note that this function can also directly be used as a
        :py:class:`pandas.DataFrame` method, in which case this argument is
        no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be the name(s) of columns in ``data``. Use a list
        if there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Note that you cannot specify both
        ``covar`` and ``x_covar``.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Note that you cannot specify both
        ``covar`` and ``y_covar``.
    tail : string
        Unused in this variant; the p-value is computed by permutation
        (two-sided by default in scikit-bio).
    method : string
        Correlation method passed to :py:func:`skbio.stats.distance.mantel`.
        Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation

    Returns
    -------
    corr_coeff, p_value, n : tuple
        Output of :py:func:`skbio.stats.distance.mantel` computed on the
        residuals: the correlation coefficient, the permutation p-value and
        the number of objects.

    Notes
    -----
    From [4]_:

    “With *partial correlation*, we find the correlation between :math:`x`
    and :math:`y` holding :math:`C` constant for both :math:`x` and
    :math:`y`. Sometimes, however, we want to hold :math:`C` constant for
    just :math:`x` or just :math:`y`. In that case, we compute a
    *semi-partial correlation*. A partial correlation is computed between
    two residuals. A semi-partial correlation is computed between one
    residual and another raw (or unresidualized) variable.”

    Note that if you are not interested in calculating the statistics and
    p-values but only the partial correlation matrix, a (faster)
    alternative is to use the :py:func:`pingouin.pcorr` method (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the `ppcor` R package.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://cran.r-project.org/web/packages/ppcor/index.html

    .. [3] https://gist.github.com/fabianp/9396204419c7b638d38f

    .. [4] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html
    """
    from pingouin.utils import _flatten_list
    from scipy.spatial.distance import squareform  # condensed vector -> square matrix
    import skbio
    # Check arguments
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string or tuple.'
    assert isinstance(y, (str, tuple)), 'y must be a string or tuple.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfi' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(C[covar].values)
        beta_x = np.linalg.lstsq(cvar, C[x].values, rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, C[y].values, rcond=None)[0]
        res_x = C[x].values - np.dot(cvar, beta_x)
        res_y = C[y].values - np.dot(cvar, beta_y)
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].values, data[y].values
        if x_covar is not None:
            cvar = np.atleast_2d(C[x_covar].values)
            beta_x = np.linalg.lstsq(cvar, C[x].values, rcond=None)[0]
            res_x = C[x].values - np.dot(cvar, beta_x)
        if y_covar is not None:
            cvar = np.atleast_2d(C[y_covar].values)
            beta_y = np.linalg.lstsq(cvar, C[y].values, rcond=None)[0]
            res_y = C[y].values - np.dot(cvar, beta_y)
    res_x = squareform(res_x)
    res_y = squareform(res_y)
    return skbio.stats.distance.mantel(res_x,
                                       res_y,
                                       method=method,
                                       permutations=permutations,
                                       strict=False)
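
This variant regresses the covariates out of ``x`` and ``y`` and then runs a Mantel permutation test on the residuals via scikit-bio. Because the residual vectors are passed through scipy's squareform, each column of ``data`` must hold a condensed distance vector (one entry per object pair). A hypothetical call with made-up distances between 3 objects (3 pairs), assuming scikit-bio is installed and accepts the residual matrices:

import numpy as np
import pandas as pd

# Hypothetical condensed distance vectors: 3 objects -> 3 pairwise distances,
# one column per distance matrix
df = pd.DataFrame({'dx': [0.1, 0.4, 0.3],
                   'dy': [0.2, 0.5, 0.3],
                   'dc': [0.0, 0.3, 0.2]})
# skbio.stats.distance.mantel returns (correlation, p-value, n)
r, pval, n = mantel_partial_corr(data=df, x='dx', y='dy', covar='dc')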
Code example #5
File: correlation.py  Project: AnnaTruzzi/pingouin
def partial_corr(data=None,
                 x=None,
                 y=None,
                 covar=None,
                 x_covar=None,
                 y_covar=None,
                 tail='two-sided',
                 method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : pd.DataFrame
        Dataframe. Note that this function can also directly be used as a
        :py:class:`pandas.DataFrame` method, in which case this argument is
        no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be the name(s) of columns in ``data``. Use a list
        if there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Note that you cannot specify both
        ``covar`` and ``x_covar``.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Note that you cannot specify both
        ``covar`` and ``y_covar``.
    tail : string
        Specify whether to return the 'one-sided' or 'two-sided' p-value.
    method : string
        Specify which method to use for the computation of the correlation
        coefficient. Available methods are ::

        'pearson' : Pearson product-moment correlation
        'spearman' : Spearman rank-order correlation
        'kendall' : Kendall’s tau (ordinal data)
        'percbend' : percentage bend correlation (robust)
        'shepherd' : Shepherd's pi correlation (robust Spearman)
        'skipped' : skipped correlation (robust Spearman, requires sklearn)

    Returns
    -------
    stats : pandas DataFrame
        Test summary ::

        'n' : Sample size (after NaN removal)
        'outliers' : number of outliers (only for 'shepherd' or 'skipped')
        'r' : Correlation coefficient
        'CI95' : 95% parametric confidence intervals
        'r2' : R-squared
        'adj_r2' : Adjusted R-squared
        'p-val' : one or two tailed p-value
        'BF10' : Bayes Factor of the alternative hypothesis (Pearson only)
        'power' : achieved power of the test (= 1 - type II error).

    Notes
    -----
    From [4]_:

    “With *partial correlation*, we find the correlation between :math:`x`
    and :math:`y` holding :math:`C` constant for both :math:`x` and
    :math:`y`. Sometimes, however, we want to hold :math:`C` constant for
    just :math:`x` or just :math:`y`. In that case, we compute a
    *semi-partial correlation*. A partial correlation is computed between
    two residuals. A semi-partial correlation is computed between one
    residual and another raw (or unresidualized) variable.”

    Note that if you are not interested in calculating the statistics and
    p-values but only the partial correlation matrix, a (faster)
    alternative is to use the :py:func:`pingouin.pcorr` method (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the `ppcor` R package.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://cran.r-project.org/web/packages/ppcor/index.html

    .. [3] https://gist.github.com/fabianp/9396204419c7b638d38f

    .. [4] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1')
              n      r         CI95%     r2  adj_r2     p-val    BF10  power
    pearson  30  0.568  [0.26, 0.77]  0.323   0.273  0.001055  37.773  0.925

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman')
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.491  [0.16, 0.72]  0.242   0.185  0.005817  0.809

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'], method='spearman')
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.568  [0.26, 0.77]  0.323   0.273  0.001049  0.925

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on ``x``

    >>> pg.partial_corr(data=df, x='x', y='y', x_covar=['cv1', 'cv2', 'cv3'])
              n      r         CI95%     r2  adj_r2     p-val   BF10  power
    pearson  30  0.463  [0.12, 0.71]  0.215   0.156  0.009946  5.404  0.752

    6. Semi-partial on both ``x`` and ``y`` controlling for different variables

    >>> pg.partial_corr(data=df, x='x', y='y', x_covar='cv1',
    ...                 y_covar=['cv2', 'cv3'], method='spearman')
               n      r         CI95%     r2  adj_r2     p-val  power
    spearman  30  0.429  [0.08, 0.68]  0.184   0.123  0.018092  0.676
    """
    from pingouin.utils import _flatten_list
    # Check arguments
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string or tuple.'
    assert isinstance(y, (str, tuple)), 'y must be a string or tuple.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfi' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(C[covar].values)
        beta_x = np.linalg.lstsq(cvar, C[x].values, rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, C[y].values, rcond=None)[0]
        res_x = C[x].values - np.dot(cvar, beta_x)
        res_y = C[y].values - np.dot(cvar, beta_y)
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].values, data[y].values
        if x_covar is not None:
            cvar = np.atleast_2d(C[x_covar].values)
            beta_x = np.linalg.lstsq(cvar, C[x].values, rcond=None)[0]
            res_x = C[x].values - np.dot(cvar, beta_x)
        if y_covar is not None:
            cvar = np.atleast_2d(C[y_covar].values)
            beta_y = np.linalg.lstsq(cvar, C[y].values, rcond=None)[0]
            res_y = C[y].values - np.dot(cvar, beta_y)
    return corr(res_x, res_y, method=method, tail=tail)
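
The docstring quote above defines a partial correlation as the plain correlation between two sets of residuals, which is exactly what this function computes. A small NumPy sketch that reproduces docstring example 1 by hand, using the same partial_corr dataset:

import numpy as np
import pingouin as pg

df = pg.read_dataset('partial_corr')
# Standardize first, so the least-squares fits need no intercept
Z = (df - df.mean()) / df.std()
cv = Z[['cv1']].to_numpy()
res_x = Z['x'].to_numpy() - cv @ np.linalg.lstsq(cv, Z['x'].to_numpy(), rcond=None)[0]
res_y = Z['y'].to_numpy() - cv @ np.linalg.lstsq(cv, Z['y'].to_numpy(), rcond=None)[0]
# Pearson correlation of the two residual vectors = partial correlation (~0.568)
print(round(np.corrcoef(res_x, res_y)[0, 1], 3))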
Code example #6
File: correlation.py  Project: snijesh/pingouin
def partial_corr(data=None,
                 x=None,
                 y=None,
                 covar=None,
                 x_covar=None,
                 y_covar=None,
                 tail='two-sided',
                 method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe. Note that this function can also directly be used as a
        :py:class:`pandas.DataFrame` method, in which case this argument is
        no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be the name(s) of columns in ``data``. Use a list
        if there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Note that you cannot specify both
        ``covar`` and ``x_covar``.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Note that you cannot specify both
        ``covar`` and ``y_covar``.
    tail : string
        Specify whether to return the `'one-sided'` or `'two-sided'` p-value.
        Note that the former is simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: p-value of the test (one- or two-sided, depending
          on ``tail``)
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    Notes
    -----
    From [1]_:

        *With partial correlation, we find the correlation between x
        and y holding C constant for both x and
        y. Sometimes, however, we want to hold C constant for
        just x or just y. In that case, we compute a
        semi-partial correlation. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable.*

    Note that if you are not interested in calculating the statistics and
    p-values but only the partial correlation matrix, a (faster)
    alternative is to use the :py:func:`pingouin.pcorr` method (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    References
    ----------
    .. [1] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%     r2  adj_r2  p-val    BF10  power
    pearson  30  0.568  [0.26, 0.77]  0.323   0.273  0.001  37.773  0.925

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  0.809

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'],
    ...                 method='spearman').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.568  [0.26, 0.77]  0.323   0.273  0.001  0.925

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r         CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.463  [0.12, 0.71]  0.215   0.156   0.01  5.404  0.752

    6. Semi-partial on both x and y controlling for different variables

    >>> pg.partial_corr(data=df, x='x', y='y', x_covar='cv1',
    ...                 y_covar=['cv2', 'cv3'], method='spearman').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.429  [0.08, 0.68]  0.184   0.123  0.018  0.676
    """
    from pingouin.utils import _flatten_list
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string or tuple.'
    assert isinstance(y, (str, tuple)), 'y must be a string or tuple.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]

    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(C[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
        res_x = C[x].to_numpy() - cvar @ beta_x
        res_y = C[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(C[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
            res_x = C[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(C[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
            res_y = C[y].to_numpy() - cvar @ beta_y
    return corr(res_x, res_y, method=method, tail=tail)
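
Docstring example 5 (semi-partial correlation on x) can be reproduced the same way: only ``x`` is residualized, while ``y`` keeps its raw values, exactly as the "fake" residuals initialization above shows. A sketch using the same dataset:

import numpy as np
import pingouin as pg

df = pg.read_dataset('partial_corr')
Z = (df - df.mean()) / df.std()
cv = Z[['cv1', 'cv2', 'cv3']].to_numpy()
beta = np.linalg.lstsq(cv, Z['x'].to_numpy(), rcond=None)[0]
res_x = Z['x'].to_numpy() - cv @ beta
# Correlate the x-residuals with the raw (unresidualized) y values (~0.463)
print(round(np.corrcoef(res_x, df['y'].to_numpy())[0, 1], 3))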
Code example #7
File: pairwise.py  Project: jjwelton187/pingouin
def pairwise_ttests(data=None,
                    dv=None,
                    between=None,
                    within=None,
                    subject=None,
                    parametric=True,
                    alpha=.05,
                    tail='two-sided',
                    padjust='none',
                    effsize='hedges',
                    nan_policy='listwise',
                    return_desc=False,
                    interaction=True,
                    export_filename=None):
    '''Pairwise T-tests.

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame. Note that this function can also directly be used as a
        Pandas method, in which case this argument is no longer needed.
    dv : string
        Name of column containing the dependent variable.
    between : string or list with 2 elements
        Name of column(s) containing the between factor(s).
    within : string or list with 2 elements
        Name of column(s) containing the within factor(s).
    subject : string
        Name of column containing the subject identifier. Compulsory for
        contrasts including a within-subject factor.
    parametric : boolean
        If True (default), use the parametric :py:func:`ttest` function.
        If False, use :py:func:`pingouin.wilcoxon` or :py:func:`pingouin.mwu`
        for paired or unpaired samples, respectively.
    alpha : float
        Significance level
    tail : string
        Specify whether the alternative hypothesis is `'two-sided'` or
        `'one-sided'`. Can also be `'greater'` or `'less'` to specify the
        direction of the test. `'greater'` tests the alternative that ``x``
        has a larger mean than ``y``. If tail is `'one-sided'`, Pingouin will
        automatically infer the one-sided alternative hypothesis of the test
        based on the test statistic.
    padjust : string
        Method used for testing and adjustment of pvalues.
        Available methods are ::

        'none' : no correction
        'bonf' : one-step Bonferroni correction
        'sidak' : one-step Sidak correction
        'holm' : step-down method using Bonferroni adjustments
        'fdr_bh' : Benjamini/Hochberg FDR correction
        'fdr_by' : Benjamini/Yekutieli FDR correction
    effsize : string or None
        Effect size type. Available methods are ::

        'none' : no effect size
        'cohen' : Unbiased Cohen d
        'hedges' : Hedges g
        'glass': Glass delta
        'r' : Pearson correlation coefficient
        'eta-square' : Eta-square
        'odds-ratio' : Odds ratio
        'AUC' : Area Under the Curve
        'CLES' : Common Language Effect Size
    nan_policy : string
        Can be `'listwise'` for listwise deletion of missing values in repeated
        measures design (= complete-case analysis) or `'pairwise'` for the
        more liberal pairwise deletion (= available-case analysis).

        .. versionadded:: 0.2.9
    return_desc : boolean
        If True, append group means and std to the output dataframe
    interaction : boolean
        If there are multiple factors and ``interaction`` is True (default),
        Pingouin will also calculate T-tests for the interaction term (see
        Notes).

        .. versionadded:: 0.2.9
    export_filename : string
        Filename (without extension) for the output file.
        If None, do not export the table.
        By default, the file will be created in the current python console
        directory. To change that, specify the filename with full path.

    Returns
    -------
    stats : DataFrame
        Stats summary ::

        'A' : Name of first measurement
        'B' : Name of second measurement
        'Paired' : indicates whether the two measurements are paired or not
        'Parametric' : indicates if (non)-parametric tests were used
        'Tail' : indicates whether the p-values are one-sided or two-sided
        'T' : T statistic (only if parametric=True)
        'U-val' : Mann-Whitney U stat (if parametric=False and unpaired data)
        'W-val' : Wilcoxon W stat (if parametric=False and paired data)
        'dof' : degrees of freedom (only if parametric=True)
        'p-unc' : Uncorrected p-values
        'p-corr' : Corrected p-values
        'p-adjust' : p-values correction method
        'BF10' : Bayes Factor
        'hedges' : effect size (or any effect size defined in ``effsize``)

    See also
    --------
    ttest, mwu, wilcoxon, compute_effsize, multicomp

    Notes
    -----
    Data are expected to be in long-format. If your data is in wide-format,
    you can use the :py:func:`pandas.melt` function to convert from wide to
    long format.

    If ``between`` or ``within`` is a list (e.g. ['col1', 'col2']),
    the function returns 1) the pairwise T-tests between the values of the
    first column, 2) the pairwise T-tests between the values of the second
    column and 3) the interaction between col1 and col2. The interaction
    depends on the order of the list, so ['col1', 'col2'] will not yield the
    same results as ['col2', 'col1'], and it is only calculated if
    ``interaction=True``.

    In other words, if ``between`` is a list with two elements, the output
    model is between1 + between2 + between1 * between2.

    Similarly, if ``within`` is a list with two elements, the output model is
    within1 + within2 + within1 * within2.

    If both ``between`` and ``within`` are specified, the function returns
    within + between + within * between.

    Missing values in repeated measurements are automatically removed using a
    listwise (default) or pairwise deletion strategy. However, you should be
    very careful since it can result in undesired values removal (especially
    for the interaction effect). We strongly recommend that you preprocess
    your data and remove the missing values before using this function.

    This function has been tested against the `pairwise.t.test` R function.

    Examples
    --------
    1. One between-factor

    >>> from pingouin import pairwise_ttests, read_dataset
    >>> df = read_dataset('mixed_anova.csv')
    >>> post_hocs = pairwise_ttests(dv='Scores', between='Group', data=df)

    2. One within-factor

    >>> post_hocs = pairwise_ttests(dv='Scores', within='Time',
    ...                             subject='Subject', data=df)
    >>> print(post_hocs)  # doctest: +SKIP

    3. Non-parametric pairwise paired test (wilcoxon)

    >>> pairwise_ttests(dv='Scores', within='Time', subject='Subject',
    ...                 data=df, parametric=False)  # doctest: +SKIP

    4. Within + Between + Within * Between with corrected p-values

    >>> posthocs = pairwise_ttests(dv='Scores', within='Time',
    ...                            subject='Subject', between='Group',
    ...                            padjust='bonf', data=df)

    5. Between1 + Between2 + Between1 * Between2

    >>> posthocs = pairwise_ttests(dv='Scores', between=['Group', 'Time'],
    ...                            data=df)

    6. Between1 + Between2, no interaction

    >>> posthocs = df.pairwise_ttests(dv='Scores', between=['Group', 'Time'],
    ...                               interaction=False)
    '''
    from .parametric import ttest
    from .nonparametric import wilcoxon, mwu

    # Safety checks
    _check_dataframe(dv=dv,
                     between=between,
                     within=within,
                     subject=subject,
                     effects='all',
                     data=data)

    assert tail in ['one-sided', 'two-sided', 'greater', 'less']
    assert isinstance(alpha, float), 'alpha must be float.'
    assert nan_policy in ['listwise', 'pairwise']

    # Check if we have multiple between or within factors
    multiple_between = False
    multiple_within = False
    contrast = None

    if isinstance(between, list):
        if len(between) > 1:
            multiple_between = True
            contrast = 'multiple_between'
            assert all([b in data.keys() for b in between])
        else:
            between = between[0]

    if isinstance(within, list):
        if len(within) > 1:
            multiple_within = True
            contrast = 'multiple_within'
            assert all([w in data.keys() for w in within])
        else:
            within = within[0]

    if all([multiple_within, multiple_between]):
        raise ValueError("Multiple between and within factors are "
                         "currently not supported. Please select only one.")

    # Check the other cases
    if isinstance(between, str) and within is None:
        contrast = 'simple_between'
        assert between in data.keys()
    if isinstance(within, str) and between is None:
        contrast = 'simple_within'
        assert within in data.keys()
    if isinstance(between, str) and isinstance(within, str):
        contrast = 'within_between'
        assert all([between in data.keys(), within in data.keys()])

    # Reorganize column order
    col_order = [
        'Contrast', 'Time', 'A', 'B', 'mean(A)', 'std(A)', 'mean(B)', 'std(B)',
        'Paired', 'Parametric', 'T', 'U-val', 'W-val', 'dof', 'Tail', 'p-unc',
        'p-corr', 'p-adjust', 'BF10', effsize
    ]

    if contrast in ['simple_within', 'simple_between']:
        # OPTION A: SIMPLE MAIN EFFECTS, WITHIN OR BETWEEN
        paired = True if contrast == 'simple_within' else False
        col = within if contrast == 'simple_within' else between
        # Remove NAN in repeated measurements
        if contrast == 'simple_within' and data[dv].isnull().values.any():
            # Only if nan_policy == 'listwise'. For pairwise deletion,
            # missing values will be removed directly in the lower-level
            # functions (e.g. pg.ttest)
            if nan_policy == 'listwise':
                data = remove_rm_na(dv=dv,
                                    within=within,
                                    subject=subject,
                                    data=data)
            else:
                # The `remove_rm_na` function also aggregates the other
                # repeated-measures factors using the mean. Here, we ensure
                # the same behavior.
                data = data.groupby([subject, within])[dv].mean().reset_index()
            # Now we check that subjects are present in all conditions
            # For example, if we have four subjects and 3 conditions,
            # and if subject 2 have missing data at the third condition,
            # we still need a row with missing values for this subject.
            if data.groupby(within)[subject].count().nunique() != 1:
                raise ValueError("Repeated measures dataframe is not balanced."
                                 " `Subjects` must have the same number of "
                                 "elements in all conditions, "
                                 "even when missing values are present.")

        # Extract effects
        grp_col = data.groupby(col, sort=False)[dv]
        labels = grp_col.groups.keys()
        # Number and labels of possible comparisons
        if len(labels) >= 2:
            combs = list(combinations(labels, 2))
            combs = np.array(combs)
            A = combs[:, 0]
            B = combs[:, 1]
        else:
            raise ValueError('Columns must have at least two unique values.')

        # Initialize dataframe
        stats = pd.DataFrame(dtype=np.float64,
                             index=range(len(combs)),
                             columns=col_order)

        # Force dtype conversion
        cols_str = ['Contrast', 'Time', 'A', 'B', 'Tail', 'p-adjust', 'BF10']
        cols_bool = ['Parametric', 'Paired']
        stats[cols_str] = stats[cols_str].astype(object)
        stats[cols_bool] = stats[cols_bool].astype(bool)

        # Fill str columns
        stats.loc[:, 'A'] = A
        stats.loc[:, 'B'] = B
        stats.loc[:, 'Contrast'] = col
        stats.loc[:, 'Tail'] = tail
        stats.loc[:, 'Paired'] = paired

        for i in range(stats.shape[0]):
            col1, col2 = stats.at[i, 'A'], stats.at[i, 'B']
            x = grp_col.get_group(col1).to_numpy(dtype=np.float64)
            y = grp_col.get_group(col2).to_numpy(dtype=np.float64)
            if parametric:
                stat_name = 'T'
                df_ttest = ttest(x, y, paired=paired, tail=tail)
                stats.at[i, 'BF10'] = df_ttest.at['T-test', 'BF10']
                stats.at[i, 'dof'] = df_ttest.at['T-test', 'dof']
            else:
                if paired:
                    stat_name = 'W-val'
                    df_ttest = wilcoxon(x, y, tail=tail)
                else:
                    stat_name = 'U-val'
                    df_ttest = mwu(x, y, tail=tail)

            # Compute Hedges / Cohen
            ef = np.round(
                compute_effsize(x=x, y=y, eftype=effsize, paired=paired), 3)

            if return_desc:
                stats.at[i, 'mean(A)'] = np.round(np.nanmean(x), 3)
                stats.at[i, 'mean(B)'] = np.round(np.nanmean(y), 3)
                stats.at[i, 'std(A)'] = np.round(np.nanstd(x), 3)
                stats.at[i, 'std(B)'] = np.round(np.nanstd(y), 3)
            stats.at[i, stat_name] = df_ttest[stat_name].iat[0]
            stats.at[i, 'p-unc'] = df_ttest['p-val'].iat[0]
            stats.at[i, effsize] = ef

        # Multiple comparisons
        padjust = None if stats['p-unc'].size <= 1 else padjust
        if padjust is not None:
            if padjust.lower() != 'none':
                _, stats['p-corr'] = multicomp(stats['p-unc'].values,
                                               alpha=alpha,
                                               method=padjust)
                stats['p-adjust'] = padjust
        else:
            stats['p-corr'] = None
            stats['p-adjust'] = None
    else:
        # B1: BETWEEN1 + BETWEEN2 + BETWEEN1 * BETWEEN2
        # B2: WITHIN1 + WITHIN2 + WITHIN1 * WITHIN2
        # B3: WITHIN + BETWEEN + WITHIN * BETWEEN
        if contrast == 'multiple_between':
            # B1
            factors = between
            fbt = factors
            fwt = [None, None]
            # eft = ['between', 'between']
            paired = False
        elif contrast == 'multiple_within':
            # B2
            factors = within
            fbt = [None, None]
            fwt = factors
            # eft = ['within', 'within']
            paired = True
        else:
            # B3
            factors = [within, between]
            fbt = [None, between]
            fwt = [within, None]
            # eft = ['within', 'between']
            paired = False

        stats = pd.DataFrame()
        for i, f in enumerate(factors):
            stats = stats.append(pairwise_ttests(dv=dv,
                                                 between=fbt[i],
                                                 within=fwt[i],
                                                 subject=subject,
                                                 data=data,
                                                 parametric=parametric,
                                                 alpha=alpha,
                                                 tail=tail,
                                                 padjust=padjust,
                                                 effsize=effsize,
                                                 return_desc=return_desc),
                                 ignore_index=True,
                                 sort=False)

        # Then compute the interaction between the factors
        if interaction:
            nrows = stats.shape[0]
            grp_fac1 = data.groupby(factors[0], sort=False)[dv]
            grp_fac2 = data.groupby(factors[1], sort=False)[dv]
            grp_both = data.groupby(factors, sort=False)[dv]
            labels_fac1 = grp_fac1.groups.keys()
            labels_fac2 = grp_fac2.groups.keys()
            # comb_fac1 = list(combinations(labels_fac1, 2))
            comb_fac2 = list(combinations(labels_fac2, 2))

            # Pairwise comparisons
            combs_list = list(product(labels_fac1, comb_fac2))
            ncombs = len(combs_list)
            # np.array(combs_list) does not work because of the tuples,
            # so we flatten each tuple manually
            combs = np.zeros(shape=(ncombs, 3), dtype=object)
            for i in range(ncombs):
                combs[i] = _flatten_list(combs_list[i], include_tuple=True)

            # Append empty rows
            idxiter = np.arange(nrows, nrows + ncombs)
            stats = stats.append(pd.DataFrame(columns=stats.columns,
                                              index=idxiter),
                                 ignore_index=True)
            # Update other columns
            stats.loc[idxiter, 'Contrast'] = factors[0] + ' * ' + factors[1]
            stats.loc[idxiter, 'Time'] = combs[:, 0]
            stats.loc[idxiter, 'Paired'] = paired
            stats.loc[idxiter, 'Tail'] = tail
            stats.loc[idxiter, 'A'] = combs[:, 1]
            stats.loc[idxiter, 'B'] = combs[:, 2]

            for i, comb in enumerate(combs):
                ic = nrows + i  # Take into account previous rows
                fac1, col1, col2 = comb
                x = grp_both.get_group((fac1, col1)).to_numpy(dtype=np.float64)
                y = grp_both.get_group((fac1, col2)).to_numpy(dtype=np.float64)
                ef = np.round(
                    compute_effsize(x=x, y=y, eftype=effsize, paired=paired),
                    3)
                if parametric:
                    stat_name = 'T'
                    df_ttest = ttest(x, y, paired=paired, tail=tail)
                    stats.at[ic, 'BF10'] = df_ttest.at['T-test', 'BF10']
                    stats.at[ic, 'dof'] = df_ttest.at['T-test', 'dof']
                else:
                    if paired:
                        stat_name = 'W-val'
                        df_ttest = wilcoxon(x, y, tail=tail)
                    else:
                        stat_name = 'U-val'
                        df_ttest = mwu(x, y, tail=tail)

                # Append to stats
                if return_desc:
                    stats.at[ic, 'mean(A)'] = np.round(np.nanmean(x), 3)
                    stats.at[ic, 'mean(B)'] = np.round(np.nanmean(y), 3)
                    stats.at[ic, 'std(A)'] = np.round(np.nanstd(x), 3)
                    stats.at[ic, 'std(B)'] = np.round(np.nanstd(y), 3)
                stats.at[ic, stat_name] = df_ttest[stat_name].iat[0]
                stats.at[ic, 'p-unc'] = df_ttest['p-val'].iat[0]
                stats.at[ic, effsize] = ef

            # Multi-comparison columns
            if padjust is not None and padjust.lower() != 'none':
                _, pcor = multicomp(stats.loc[idxiter, 'p-unc'].values,
                                    alpha=alpha,
                                    method=padjust)
                stats.loc[idxiter, 'p-corr'] = pcor
                stats.loc[idxiter, 'p-adjust'] = padjust

    # ---------------------------------------------------------------------
    # Append parametric columns
    stats.loc[:, 'Parametric'] = parametric

    # Reorder and drop empty columns
    stats = stats[np.array(col_order)[np.isin(col_order, stats.columns)]]
    stats = stats.dropna(how='all', axis=1)

    # Rename Time columns
    if (contrast in ['multiple_within', 'multiple_between', 'within_between']
            and interaction):
        stats['Time'].fillna('-', inplace=True)
        stats.rename(columns={'Time': factors[0]}, inplace=True)

    if export_filename is not None:
        _export_table(stats, export_filename)
    return stats
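
A minimal, self-contained usage sketch with synthetic long-format data (all names and values are made up; note that newer pingouin releases expose this function as pairwise_tests):

import numpy as np
import pandas as pd
import pingouin as pg

# Synthetic long-format data: 20 subjects measured at two time points
rng = np.random.RandomState(42)
n = 20
df = pd.DataFrame({'Subject': np.tile(np.arange(n), 2),
                   'Time': np.repeat(['pre', 'post'], n),
                   'Scores': rng.normal(loc=np.repeat([5.0, 6.0], n))})
posthocs = pg.pairwise_ttests(dv='Scores', within='Time', subject='Subject',
                              data=df, padjust='bonf', effsize='hedges')
print(posthocs)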
Code example #8
File: correlation.py  Project: raphaelvallat/pingouin
def partial_corr(data=None,
                 x=None,
                 y=None,
                 covar=None,
                 x_covar=None,
                 y_covar=None,
                 alternative='two-sided',
                 method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be the name(s) of columns in ``data``. Use a list
        if there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    alternative : string
        Defines the alternative hypothesis, or tail of the partial correlation. Must be one of
        "two-sided" (default), "greater" or "less". Both "greater" and "less" return a one-sided
        p-value. "greater" tests against the alternative hypothesis that the partial correlation is
        positive (greater than zero), "less" tests against the hypothesis that the partial
        correlation is negative.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'r'``: Partial correlation coefficient
        * ``'CI95'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value

    See also
    --------
    corr, pcorr, pairwise_corr, rm_corr

    Notes
    -----
    Partial correlation [1]_ measures the degree of association between ``x``
    and ``y``, after removing the effect of one or more controlling variables
    (``covar``, or :math:`Z`). Practically, this is achieved by calculating the
    correlation coefficient between the residuals of two linear regressions:

    .. math:: x \\sim Z, y \\sim Z

    Like the correlation coefficient, the partial correlation
    coefficient takes on a value in the range from –1 to 1, where 1 indicates a
    perfect positive association.

    The semipartial correlation is similar to the partial correlation,
    with the exception that the set of controlling variables is only
    removed for either ``x`` or ``y``, but not both.

    Pingouin uses the method described in [2]_ to calculate the (semi)partial
    correlation coefficients and associated p-values. This method is based on
    the inverse covariance matrix and is significantly faster than the
    traditional regression-based method. Results have been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    .. important:: Rows with missing values are automatically removed from
        data.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [0.18, 0.75]  0.005

    3. Same but one-sided test

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="greater", method='spearman').round(3)
               n      r        CI95%  p-val
    spearman  30  0.521  [0.24, 1.0]  0.003

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="less", method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [-1.0, 0.72]  0.997

    4. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'], method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.578  [0.27, 0.78]  0.001

    5. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    6. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y', x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list
    # Safety check
    assert alternative in [
        'two-sided', 'greater', 'less'
    ], ("Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
        )
    assert method in [
        'pearson', 'spearman'
    ], ('only "pearson" and "spearman" are supported for partial correlation.')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Calculate the partial correlation matrix - similar to pingouin.pcorr()
    if method == "spearman":
        # Convert the data to rank, similar to R cov()
        V = data.rank(na_option='keep').cov()
    else:
        V = data.cov()
    Vi = np.linalg.pinv(V, hermitian=True)  # Inverse covariance matrix
    Vi_diag = Vi.diagonal()
    D = np.diag(np.sqrt(1 / Vi_diag))
    pcor = -1 * (D @ Vi @ D)  # Partial correlation matrix

    if covar is not None:
        r = pcor[0, 1]
    else:
        # Semi-partial correlation matrix
        with np.errstate(divide='ignore'):
            spcor = pcor / \
                np.sqrt(np.diag(V))[..., None] / \
                np.sqrt(np.abs(Vi_diag - Vi ** 2 / Vi_diag[..., None])).T
        if y_covar is not None:
            r = spcor[0, 1]  # y_covar is removed from y
        else:
            r = spcor[1, 0]  # x_covar is removed from x

    if np.isnan(r):
        # Correlation failed (e.g. zero-variance input): return NaN
        return pd.DataFrame(
            {
                'n': n,
                'r': np.nan,
                'CI95%': np.nan,
                'p-val': np.nan
            },
            index=[method])

    # Compute the p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n, k, alternative)
    ci = compute_esci(stat=r,
                      nx=(n - k),
                      ny=(n - k),
                      eftype='r',
                      decimals=6,
                      alternative=alternative)

    # Create dictionary
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval,
    }

    # Convert to DataFrame
    stats = pd.DataFrame(stats, index=[method])

    # Define order
    col_keep = ['n', 'r', 'CI95%', 'p-val']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
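
The inverse-covariance route used above yields every pairwise partial correlation at once: if Vi is the precision (inverse covariance) matrix and D = diag(1 / sqrt(diag(Vi))), then pcor = -D @ Vi @ D. A standalone sketch that reproduces the pcorr matrix from docstring example 5:

import numpy as np
import pingouin as pg

df = pg.read_dataset('partial_corr')
V = df.cov().to_numpy()                 # covariance matrix
Vi = np.linalg.pinv(V, hermitian=True)  # precision (inverse covariance) matrix
D = np.diag(np.sqrt(1 / Vi.diagonal()))
pcor = -1 * (D @ Vi @ D)                # partial correlation matrix
np.fill_diagonal(pcor, 1.0)             # the raw diagonal is -1
# pcor[0, 1]: correlation of x and y controlling for all other columns (~0.493)
print(round(pcor[0, 1], 3))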
Code example #9
def part_corr(data=None,
              x=None,
              y=None,
              covar=None,
              x_covar=None,
              y_covar=None,
              tail='two-sided',
              method='pearson'):
    from pingouin.utils import _flatten_list
    import pingouin as pg  # pg.corr is used in the return statement
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string or tuple.'
    assert isinstance(y, (str, tuple)), 'y must be a string or tuple.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]

    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression).
    # Standardization does not work for constant columns (e.g. dummy-coded
    # plate covariates), so only standardize columns with a nonzero std
    for c in col:
        if data[c].std(axis=0) != 0:
            data[c] = (data[c] - data[c].mean(axis=0)) / data[c].std(axis=0)
    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(data[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, data[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, data[y].to_numpy(), rcond=None)[0]
        res_x = data[x].to_numpy() - cvar @ beta_x
        res_y = data[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(data[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, data[x].to_numpy(), rcond=None)[0]
            res_x = data[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(data[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, data[y].to_numpy(), rcond=None)[0]
            res_y = data[y].to_numpy() - cvar @ beta_y
    return pg.corr(res_x, res_y, method=method, tail=tail)
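
This fork skips standardization for zero-variance columns so that dummy-coded covariates (the "plate" covariates mentioned in its comments) cannot trigger a division by zero. A hypothetical call with dummy-coded plate indicators (all names and values made up; assumes a pingouin version whose pg.corr still accepts the tail argument):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x': rng.normal(size=12),
                   'y': rng.normal(size=12),
                   'plate': ['A', 'B', 'C'] * 4})
# Dummy-code the categorical covariate, dropping one level as reference
dummies = pd.get_dummies(df['plate'], prefix='plate', drop_first=True).astype(int)
df = pd.concat([df.drop(columns='plate'), dummies], axis=1)
part_corr(data=df, x='x', y='y', covar=list(dummies.columns))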
Code example #10
File: correlation.py  Project: vishalbelsare/pingouin
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, tail='two-sided', method='pearson', **kwargs):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be a names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``, ``x_covar`` and
        ``y_covar`` can be specified.
    tail : string
        Specify whether to return the `'one-sided'` or `'two-sided'` p-value.
        The former is simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level correlation functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: one- or two-tailed p-value of the test

    See also
    --------
    corr, pairwise_corr, rm_corr

    Notes
    -----
    From [1]_:

        *With partial correlation, we find the correlation between x
        and y holding C constant for both x and
        y. Sometimes, however, we want to hold C constant for
        just x or just y. In that case, we compute a
        semi-partial correlation. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable.*

    Note that if you are not interested in calculating the p-values [2]_
    but only the partial correlation matrix, a faster
    alternative is to use :py:func:`pingouin.pcorr` (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    References
    ----------
    .. [1] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    .. [2] https://online.stat.psu.edu/stat505/lesson/6/6.3

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.491  [0.14, 0.73]  0.009

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.568  [0.26, 0.77]  0.001

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list
    # Safety check
    assert tail in ['two-sided', 'one-sided'], (
        'tail must be "two-sided" or "one-sided".')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string or tuple.'
    assert isinstance(y, (str, tuple)), 'y must be a string or tuple.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]

    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col]), \
        'columns must be numeric.'

    # Drop rows with NaN
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    # dof = n - k - 2
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(C[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
        res_x = C[x].to_numpy() - cvar @ beta_x
        res_y = C[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(C[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
            res_x = C[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(C[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
            res_y = C[y].to_numpy() - cvar @ beta_y

    # Compute partial correlation coefficient
    # We do not extract the p-values at this stage because they do not account
    # for the number of covariates in the degrees of freedom
    if method == 'pearson':
        r, _ = pearsonr(res_x, res_y)
    elif method == 'spearman':
        r, _ = spearmanr(res_x, res_y, **kwargs)
    elif method == 'kendall':
        r, _ = kendalltau(res_x, res_y, **kwargs)
    elif method == 'bicor':
        r, _ = bicor(res_x, res_y, **kwargs)
    elif method == 'percbend':
        r, _ = percbend(res_x, res_y, **kwargs)
    elif method == 'shepherd':
        r, _, outliers = shepherd(res_x, res_y, **kwargs)
    elif method == 'skipped':
        r, _, outliers = skipped(res_x, res_y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4: instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoids a sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame({'n': n, 'r': np.nan, 'CI95%': np.nan,
                             'p-val': np.nan}, index=[method])

    # Sample size after outlier removal
    n_outliers = sum(outliers) if "outliers" in locals() else 0
    n_clean = n - n_outliers

    # Compute the two-sided p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
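    # Presumably: t = r * sqrt(dof / (1 - r**2)) with dof = n_clean - k - 2,
    # the p-value from the Student t distribution, and the CI from a Fisher
    # r-to-z transform with an effective sample size of n_clean - k.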
    pval = _correl_pvalue(r, n_clean, k)
    ci = compute_esci(
        stat=r, nx=(n_clean - k), ny=(n_clean - k), eftype='r', decimals=6)

    # Create dictionary
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = n_outliers

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = ['n', 'outliers', 'r', 'CI95%', 'p-val']
    col_order = [c for c in col_keep if c in stats.columns.tolist()]
    return _postprocess_dataframe(stats)[col_order]
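
# A minimal sketch (not pingouin source; data hypothetical) of the
# precision-matrix identity behind pingouin.pcorr() mentioned in the Notes:
# the partial correlation of two variables given all the others is
# -P_ij / sqrt(P_ii * P_jj), where P is the inverse of the covariance matrix.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df_demo = pd.DataFrame(rng.normal(size=(100, 4)),
                       columns=['x', 'y', 'cv1', 'cv2'])
prec = np.linalg.inv(df_demo.cov().to_numpy())  # precision matrix
d = np.sqrt(np.diag(prec))
pcor = -prec / np.outer(d, d)  # full partial correlation matrix
np.fill_diagonal(pcor, 1.0)
# pcor[0, 1] should match
# partial_corr(data=df_demo, x='x', y='y', covar=['cv1', 'cv2'])['r']
# up to floating-point error.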
Code example #11
def pairwise_ttests(data=None, dv=None, between=None, within=None,
                    subject=None, parametric=True, marginal=True, alpha=.05,
                    tail='two-sided', padjust='none', effsize='hedges',
                    correction='auto', nan_policy='listwise',
                    return_desc=False, interaction=True):
    """Pairwise T-tests.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Note that this function can also directly be used as a
        Pandas method, in which case this argument is no longer needed.
    dv : string
        Name of column containing the dependent variable.
    between : string or list with 2 elements
        Name of column(s) containing the between-subject factor(s).

        .. warning:: Note that Pingouin gives slightly different T and
            p-values compared to JASP posthoc tests for 2-way factorial
            designs, because Pingouin does not pool the standard error for
            each factor, but rather calculates each pairwise T-test completely
            independently of the others.
    within : string or list with 2 elements
        Name of column(s) containing the within-subject factor(s), i.e. the
        repeated measurements.
    subject : string
        Name of column containing the subject identifier. This is compulsory
        when ``within`` is specified.
    parametric : boolean
        If True (default), use the parametric :py:func:`ttest` function.
        If False, use :py:func:`pingouin.wilcoxon` or :py:func:`pingouin.mwu`
        for paired or unpaired samples, respectively.
    marginal : boolean
        If True, average over repeated measures factor when working with mixed
        or two-way repeated measures design. For instance, in mixed design,
        the between-subject pairwise T-test(s) will be calculated after
        averaging across all levels of the within-subject repeated measures
        factor (the so-called *"marginal means"*).

        Similarly, in two-way repeated measures factor, the pairwise T-test(s)
        will be calculated after averaging across all levels of the other
        repeated measures factor.

        Setting ``marginal=True`` is recommended when doing posthoc
        testing with multiple factors in order to avoid violating the
        assumption of independence and conflating the degrees of freedom by the
        number of repeated measurements. This is the default behavior of JASP.

        .. warning:: The default behavior of Pingouin <0.3.2 was
            ``marginal = False``, which may have led to incorrect p-values
            for mixed or two-way repeated measures design. Make sure to always
            use the latest version of Pingouin.

        .. versionadded:: 0.3.2
    alpha : float
        Significance level
    tail : string
        Specify whether the alternative hypothesis is `'two-sided'` or
        `'one-sided'`. Can also be `'greater'` or `'less'` to specify the
        direction of the test. `'greater'` tests the alternative that ``x``
        has a larger mean than ``y``. If tail is `'one-sided'`, Pingouin will
        automatically infer the one-sided alternative hypothesis of the test
        based on the test statistic.
    padjust : string
        Method used for testing and adjustment of pvalues.

        * ``'none'``: no correction
        * ``'bonf'``: one-step Bonferroni correction
        * ``'sidak'``: one-step Sidak correction
        * ``'holm'``: step-down method using Bonferroni adjustments
        * ``'fdr_bh'``: Benjamini/Hochberg FDR correction
        * ``'fdr_by'``: Benjamini/Yekutieli FDR correction
    effsize : string or None
        Effect size type. Available methods are:

        * ``'none'``: no effect size
        * ``'cohen'``: Unbiased Cohen d
        * ``'hedges'``: Hedges g
        * ``'glass'``: Glass delta
        * ``'r'``: Pearson correlation coefficient
        * ``'eta-square'``: Eta-square
        * ``'odds-ratio'``: Odds ratio
        * ``'AUC'``: Area Under the Curve
        * ``'CLES'``: Common Language Effect Size
    correction : string or boolean
        For unpaired two-sample T-tests, specify whether or not to correct for
        unequal variances using the Welch separate-variances T-test. If
        `'auto'`, it will automatically use the Welch T-test when the sample
        sizes are unequal, as recommended by Zimmerman (2004).

        .. versionadded:: 0.3.2
    nan_policy : string
        Can be `'listwise'` for listwise deletion of missing values in repeated
        measures design (= complete-case analysis) or `'pairwise'` for the
        more liberal pairwise deletion (= available-case analysis).

        .. versionadded:: 0.2.9
    return_desc : boolean
        If True, append group means and std to the output dataframe
    interaction : boolean
        If there are multiple factors and ``interaction`` is True (default),
        Pingouin will also calculate T-tests for the interaction term (see
        Notes).

        .. versionadded:: 0.2.9

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'A'``: Name of first measurement
        * ``'B'``: Name of second measurement
        * ``'Paired'``: indicates whether the two measurements are paired or
          not
        * ``'Parametric'``: indicates if (non)-parametric tests were used
        * ``'Tail'``: indicates whether the p-values are one-sided or two-sided
        * ``'T'``: T statistic (only if parametric=True)
        * ``'U-val'``: Mann-Whitney U stat (if parametric=False and unpaired
          data)
        * ``'W-val'``: Wilcoxon W stat (if parametric=False and paired data)
        * ``'dof'``: degrees of freedom (only if parametric=True)
        * ``'p-unc'``: Uncorrected p-values
        * ``'p-corr'``: Corrected p-values
        * ``'p-adjust'``: p-values correction method
        * ``'BF10'``: Bayes Factor
        * ``'hedges'``: effect size (or any effect size defined in
          ``effsize``)

    See also
    --------
    ttest, mwu, wilcoxon, compute_effsize, multicomp

    Notes
    -----
    Data are expected to be in long-format. If your data is in wide-format,
    you can use the :py:func:`pandas.melt` function to convert from wide to
    long format.

    If ``between`` or ``within`` is a list (e.g. ['col1', 'col2']),
    the function returns 1) the pairwise T-tests between each value of the
    first column, 2) the pairwise T-tests between each value of the second
    column, and 3) the interaction between col1 and col2. The interaction
    depends on the order of the list, so ['col1', 'col2'] will not yield the
    same results as ['col2', 'col1'], and will only be calculated if
    ``interaction=True``.

    In other words, if ``between`` is a list with two elements, the output
    model is between1 + between2 + between1 * between2.

    Similarly, if ``within`` is a list with two elements, the output model is
    within1 + within2 + within1 * within2.

    If both ``between`` and ``within`` are specified, the output model is
    within + between + within * between (= mixed design).

    Missing values in repeated measurements are automatically removed using a
    listwise (default) or pairwise deletion strategy. However, you should be
    very careful since this can result in the removal of values you wanted to
    keep (especially for the interaction effect). We strongly recommend that
    you preprocess your data and remove the missing values before using this
    function.

    This function has been tested against the `pairwise.t.test
    <https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/pairwise.t.test>`_
    R function.

    .. warning:: Versions of Pingouin below 0.3.2 gave incorrect results
        for mixed and two-way repeated measures design (see above warning for
        the ``marginal`` argument).

    .. warning:: Pingouin gives slightly different results than the JASP's
        posthoc module when working with multiple factors (e.g. mixed,
        factorial or 2-way repeated measures design). This is mostly caused by
        the fact that Pingouin does not pool the standard error for
        between-subject and interaction contrasts. You should always double
        check your results with JASP or another statistical software.

    Examples
    --------
    For more examples, please refer to the `Jupyter notebooks
    <https://github.com/raphaelvallat/pingouin/blob/master/notebooks/01_ANOVA.ipynb>`_

    1. One between-subject factor

    >>> from pingouin import pairwise_ttests, read_dataset
    >>> df = read_dataset('mixed_anova.csv')
    >>> pairwise_ttests(dv='Scores', between='Group', data=df) # doctest: +SKIP

    2. One within-subject factor

    >>> post_hocs = pairwise_ttests(dv='Scores', within='Time',
    ...                             subject='Subject', data=df)
    >>> print(post_hocs)  # doctest: +SKIP

    3. Non-parametric pairwise paired test (wilcoxon)

    >>> pairwise_ttests(dv='Scores', within='Time', subject='Subject',
    ...                 data=df, parametric=False)  # doctest: +SKIP

    4. Mixed design (within and between) with bonferroni-corrected p-values

    >>> posthocs = pairwise_ttests(dv='Scores', within='Time',
    ...                            subject='Subject', between='Group',
    ...                            padjust='bonf', data=df)

    5. Two between-subject factors. The order of the list matters!

    >>> posthocs = pairwise_ttests(dv='Scores', between=['Group', 'Time'],
    ...                            data=df)

    6. Same but without the interaction

    >>> posthocs = df.pairwise_ttests(dv='Scores', between=['Group', 'Time'],
    ...                               interaction=False)
    """
    from .parametric import ttest
    from .nonparametric import wilcoxon, mwu

    # Safety checks
    _check_dataframe(dv=dv, between=between, within=within, subject=subject,
                     effects='all', data=data)
    assert tail in ['one-sided', 'two-sided', 'greater', 'less']
    assert isinstance(alpha, float), 'alpha must be float.'
    assert nan_policy in ['listwise', 'pairwise']

    # Check if we have multiple between or within factors
    multiple_between = False
    multiple_within = False
    contrast = None

    if isinstance(between, list):
        if len(between) > 1:
            multiple_between = True
            contrast = 'multiple_between'
            assert all([b in data.keys() for b in between])
        else:
            between = between[0]

    if isinstance(within, list):
        if len(within) > 1:
            multiple_within = True
            contrast = 'multiple_within'
            assert all([w in data.keys() for w in within])
        else:
            within = within[0]

    if all([multiple_within, multiple_between]):
        raise ValueError("Multiple between and within factors are",
                         "currently not supported. Please select only one.")

    # Check the other cases
    if isinstance(between, str) and within is None:
        contrast = 'simple_between'
        assert between in data.keys()
    if isinstance(within, str) and between is None:
        contrast = 'simple_within'
        assert within in data.keys()
    if isinstance(between, str) and isinstance(within, str):
        contrast = 'within_between'
        assert all([between in data.keys(), within in data.keys()])

    # Reorganize column order
    col_order = ['Contrast', 'Time', 'A', 'B', 'mean(A)', 'std(A)', 'mean(B)',
                 'std(B)', 'Paired', 'Parametric', 'T', 'U-val', 'W-val',
                 'dof', 'Tail', 'p-unc', 'p-corr', 'p-adjust', 'BF10',
                 effsize]

    if contrast in ['simple_within', 'simple_between']:
        # OPTION A: SIMPLE MAIN EFFECTS, WITHIN OR BETWEEN
        paired = (contrast == 'simple_within')
        col = within if contrast == 'simple_within' else between
        # Remove NAN in repeated measurements
        if contrast == 'simple_within' and data[dv].isnull().to_numpy().any():
            # Only if nan_policy == 'listwise'. For pairwise deletion,
            # missing values will be removed directly in the lower-level
            # functions (e.g. pg.ttest)
            if nan_policy == 'listwise':
                data = remove_rm_na(dv=dv, within=within, subject=subject,
                                    data=data)
            else:
                # The `remove_rm_na` also aggregate other repeated measures
                # factor using the mean. Here, we ensure this behavior too.
                data = data.groupby([subject, within])[dv].mean().reset_index()
            # Now we check that subjects are present in all conditions
            # For example, if we have four subjects and 3 conditions,
            # and if subject 2 have missing data at the third condition,
            # we still need a row with missing values for this subject.
            if data.groupby(within)[subject].count().nunique() != 1:
                raise ValueError("Repeated measures dataframe is not balanced."
                                 " `Subjects` must have the same number of "
                                 "elements in all conditions, "
                                 "even when missing values are present.")

        # Extract effects
        grp_col = data.groupby(col, sort=False)[dv]
        labels = grp_col.groups.keys()
        # Number and labels of possible comparisons
        if len(labels) >= 2:
            combs = list(combinations(labels, 2))
            combs = np.array(combs)
            A = combs[:, 0]
            B = combs[:, 1]
        else:
            raise ValueError('Columns must have at least two unique values.')

        # Initialize dataframe
        stats = pd.DataFrame(dtype=np.float64, index=range(len(combs)),
                             columns=col_order)

        # Force dtype conversion
        cols_str = ['Contrast', 'Time', 'A', 'B', 'Tail', 'p-adjust', 'BF10']
        cols_bool = ['Parametric', 'Paired']
        stats[cols_str] = stats[cols_str].astype(object)
        stats[cols_bool] = stats[cols_bool].astype(bool)

        # Fill str columns
        stats.loc[:, 'A'] = A
        stats.loc[:, 'B'] = B
        stats.loc[:, 'Contrast'] = col
        stats.loc[:, 'Tail'] = tail
        stats.loc[:, 'Paired'] = paired

        for i in range(stats.shape[0]):
            col1, col2 = stats.at[i, 'A'], stats.at[i, 'B']
            x = grp_col.get_group(col1).to_numpy(dtype=np.float64)
            y = grp_col.get_group(col2).to_numpy(dtype=np.float64)
            if parametric:
                stat_name = 'T'
                df_ttest = ttest(x, y, paired=paired, tail=tail,
                                 correction=correction)
                stats.at[i, 'BF10'] = df_ttest.at['T-test', 'BF10']
                stats.at[i, 'dof'] = df_ttest.at['T-test', 'dof']
            else:
                if paired:
                    stat_name = 'W-val'
                    df_ttest = wilcoxon(x, y, tail=tail)
                else:
                    stat_name = 'U-val'
                    df_ttest = mwu(x, y, tail=tail)

            # Compute Hedges / Cohen
            ef = compute_effsize(x=x, y=y, eftype=effsize, paired=paired)

            if return_desc:
                stats.at[i, 'mean(A)'] = np.nanmean(x)
                stats.at[i, 'mean(B)'] = np.nanmean(y)
                stats.at[i, 'std(A)'] = np.nanstd(x, ddof=1)
                stats.at[i, 'std(B)'] = np.nanstd(y, ddof=1)
            stats.at[i, stat_name] = df_ttest[stat_name].iat[0]
            stats.at[i, 'p-unc'] = df_ttest['p-val'].iat[0]
            stats.at[i, effsize] = ef

        # Multiple comparisons
        padjust = None if stats['p-unc'].size <= 1 else padjust
        if padjust is not None:
            if padjust.lower() != 'none':
                _, stats['p-corr'] = multicomp(stats['p-unc'].to_numpy(),
                                               alpha=alpha, method=padjust)
                stats['p-adjust'] = padjust
        else:
            stats['p-corr'] = None
            stats['p-adjust'] = None
    else:
        # Multiple factors
        if contrast == 'multiple_between':
            # B1: BETWEEN1 + BETWEEN2 + BETWEEN1 * BETWEEN2
            factors = between
            fbt = factors
            fwt = [None, None]
            paired = False  # the interaction is not paired
            agg = [False, False]
            # TODO: add a pool SD option, as in JASP and JAMOVI?
        elif contrast == 'multiple_within':
            # B2: WITHIN1 + WITHIN2 + WITHIN1 * WITHIN2
            factors = within
            fbt = [None, None]
            fwt = factors
            paired = True
            agg = [True, True]  # Calculate marginal means for both factors
        else:
            # B3: WITHIN + BETWEEN + WITHIN * BETWEEN
            factors = [within, between]
            fbt = [None, between]
            fwt = [within, None]
            paired = False
            agg = [False, True]

        stats = pd.DataFrame()
        for i, f in enumerate(factors):
            # Introduced in Pingouin v0.3.2
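            # Marginal means: average over the other repeated-measures factor
            # so each subject contributes a single value per level of f before
            # the pairwise tests on f are computed.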
            if all([agg[i], marginal]):
                tmp = data.groupby([subject, f], as_index=False,
                                   sort=False).mean()
            else:
                tmp = data
            stats = pd.concat(
                [stats, pairwise_ttests(dv=dv,
                                        between=fbt[i],
                                        within=fwt[i],
                                        subject=subject,
                                        data=tmp,
                                        parametric=parametric,
                                        marginal=marginal,
                                        alpha=alpha,
                                        tail=tail,
                                        padjust=padjust,
                                        effsize=effsize,
                                        correction=correction,
                                        nan_policy=nan_policy,
                                        return_desc=return_desc)],
                ignore_index=True, sort=False)

        # Then compute the interaction between the factors
        if interaction:
            nrows = stats.shape[0]
            grp_fac1 = data.groupby(factors[0], sort=False)[dv]
            grp_fac2 = data.groupby(factors[1], sort=False)[dv]
            grp_both = data.groupby(factors, sort=False)[dv]
            labels_fac1 = grp_fac1.groups.keys()
            labels_fac2 = grp_fac2.groups.keys()
            # comb_fac1 = list(combinations(labels_fac1, 2))
            comb_fac2 = list(combinations(labels_fac2, 2))

            # Pairwise comparisons
            combs_list = list(product(labels_fac1, comb_fac2))
            ncombs = len(combs_list)
            # np.array(combs_list) does not work because of the tuples,
            # so we need to flatten each tuple
            combs = np.zeros(shape=(ncombs, 3), dtype=object)
            for i in range(ncombs):
                combs[i] = _flatten_list(combs_list[i], include_tuple=True)

            # Append empty rows
            idxiter = np.arange(nrows, nrows + ncombs)
            stats = pd.concat([stats, pd.DataFrame(columns=stats.columns,
                                                   index=idxiter)],
                              ignore_index=True)
            # Update other columns
            stats.loc[idxiter, 'Contrast'] = factors[0] + ' * ' + factors[1]
            stats.loc[idxiter, 'Time'] = combs[:, 0]
            stats.loc[idxiter, 'Paired'] = paired
            stats.loc[idxiter, 'Tail'] = tail
            stats.loc[idxiter, 'A'] = combs[:, 1]
            stats.loc[idxiter, 'B'] = combs[:, 2]

            for i, comb in enumerate(combs):
                ic = nrows + i  # Take into account previous rows
                fac1, col1, col2 = comb
                x = grp_both.get_group((fac1, col1)).to_numpy(dtype=np.float64)
                y = grp_both.get_group((fac1, col2)).to_numpy(dtype=np.float64)
                ef = compute_effsize(x=x, y=y, eftype=effsize, paired=paired)
                if parametric:
                    stat_name = 'T'
                    df_ttest = ttest(x, y, paired=paired, tail=tail,
                                     correction=correction)
                    stats.at[ic, 'BF10'] = df_ttest.at['T-test', 'BF10']
                    stats.at[ic, 'dof'] = df_ttest.at['T-test', 'dof']
                else:
                    if paired:
                        stat_name = 'W-val'
                        df_ttest = wilcoxon(x, y, tail=tail)
                    else:
                        stat_name = 'U-val'
                        df_ttest = mwu(x, y, tail=tail)

                # Append to stats
                if return_desc:
                    stats.at[ic, 'mean(A)'] = np.nanmean(x)
                    stats.at[ic, 'mean(B)'] = np.nanmean(y)
                    stats.at[ic, 'std(A)'] = np.nanstd(x, ddof=1)
                    stats.at[ic, 'std(B)'] = np.nanstd(y, ddof=1)
                stats.at[ic, stat_name] = df_ttest[stat_name].iat[0]
                stats.at[ic, 'p-unc'] = df_ttest['p-val'].iat[0]
                stats.at[ic, effsize] = ef

            # Multi-comparison columns
            if padjust is not None and padjust.lower() != 'none':
                _, pcor = multicomp(stats.loc[idxiter, 'p-unc'].to_numpy(),
                                    alpha=alpha, method=padjust)
                stats.loc[idxiter, 'p-corr'] = pcor
                stats.loc[idxiter, 'p-adjust'] = padjust

    # ---------------------------------------------------------------------
    # Append parametric columns
    stats.loc[:, 'Parametric'] = parametric

    # Reorder and drop empty columns
    stats = stats[np.array(col_order)[np.isin(col_order, stats.columns)]]
    stats = stats.dropna(how='all', axis=1)

    # Rename Time columns
    if (contrast in ['multiple_within', 'multiple_between', 'within_between']
       and interaction):
        stats['Time'] = stats['Time'].fillna('-')
        stats = stats.rename(columns={'Time': factors[0]})

    return stats
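
# A minimal sketch (assumed behavior, not pingouin source) of the one-step
# Bonferroni adjustment that padjust='bonf' applies through
# pingouin.multicomp: each uncorrected p-value is multiplied by the number of
# comparisons and clipped at 1.
import numpy as np

def bonferroni(pvals, alpha=0.05):
    """Return (reject, corrected p-values) for one-step Bonferroni."""
    pvals = np.asarray(pvals, dtype=float)
    p_corr = np.clip(pvals * pvals.size, None, 1.0)
    return p_corr < alpha, p_corr

print(bonferroni([0.01, 0.04, 0.20]))  # p_corr -> [0.03, 0.12, 0.60]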