Example #1
    def test__postprocess_dataframe(self):
        """Test function _postprocess_dataframe."""
        df2 = df.copy()
        # add some more values and give a stringy index
        df2.Values = [1.54321, 5.87654, 8.23456, 3.45678]
        df2 = df2.assign(Values2=[1.54321, 5.87654, 8.23456, 3.45678])
        df2.index = ['row' + str(x) for x in df.index]

        # set rounding options (keeping original options dict to restore after)
        old_opts = pingouin.options.copy()
        pingouin.options.clear()
        pingouin.options['round'] = 4
        pingouin.options['round.cell.[row0]x[Values]'] = None
        pingouin.options['round.column.Values'] = 3
        pingouin.options['round.row.row1'] = 2
        pingouin.options['round.cell.[row3]x[Values2]'] = 0

        df_expected = df2.copy()
        df_expected.Values = [1.54321, 5.877, 8.235, 3.457]
        df_expected.Values2 = [1.5432, 5.88, 8.2346, 3.0]

        df2 = _postprocess_dataframe(df2)
        pd.testing.assert_frame_equal(df2, df_expected)

        # restore old options
        pingouin.options.update(old_opts)
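# Hedged sketch, not part of the original test file: a standalone illustration of
# the rounding precedence implied by the expected values above, i.e. cell-level
# keys override column-level keys, which override row-level keys, which override
# the global 'round' option. Assumes _postprocess_dataframe is importable from
# pingouin.utils, as in the pingouin source.
import pandas as pd
import pingouin
from pingouin.utils import _postprocess_dataframe

demo = pd.DataFrame({'Values': [1.54321, 5.87654]}, index=['row0', 'row1'])
saved = pingouin.options.copy()
pingouin.options['round'] = 4                          # global default
pingouin.options['round.column.Values'] = 3            # overrides the global default
pingouin.options['round.row.row1'] = 2                 # overrides the default, not the column rule
pingouin.options['round.cell.[row0]x[Values]'] = None  # disables rounding for that cell
print(_postprocess_dataframe(demo))  # expected: row0 -> 1.54321, row1 -> 5.877
pingouin.options.clear()
pingouin.options.update(saved)  # restore, as the test above does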
Example #2
def multivariate_ttest(X, Y=None, paired=False):
    """Hotelling T-squared test (= multivariate T-test)

    Parameters
    ----------
    X : np.array
        First data matrix of shape (n_samples, n_features).
    Y : np.array or None
        Second data matrix of shape (n_samples, n_features). If ``Y`` is a 1D
        array of shape (n_features), a one-sample test is performed where the
        null hypothesis is defined in ``Y``. If ``Y`` is None, a one-sample
        test is performed against np.zeros(n_features).
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent. If ``paired`` is True, ``X`` and ``Y`` must
        have exactly the same shape.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'T2'``: T-squared value
        * ``'F'``: F-value
        * ``'df1'``: first degree of freedom
        * ``'df2'``: second degree of freedom
        * ``'pval'``: p-value

    See Also
    --------
    multivariate_normality : Multivariate normality test.
    ttest : Univariate T-test.

    Notes
    -----
    Hotelling's T-squared test [1]_ is the multivariate counterpart of
    the T-test.

    Rows with missing values are automatically removed using the
    :py:func:`remove_na` function.

    Tested against the `Hotelling
    <https://cran.r-project.org/web/packages/Hotelling/Hotelling.pdf>`_ R
    package.

    References
    ----------
    .. [1] Hotelling, H. The Generalization of Student's Ratio. Ann. Math.
           Statist. 2 (1931), no. 3, 360--378.

    See also http://www.real-statistics.com/multivariate-statistics/

    Examples
    --------
    Two-sample independent Hotelling T-squared test

    >>> import pingouin as pg
    >>> data = pg.read_dataset('multivariate')
    >>> dvs = ['Fever', 'Pressure', 'Aches']
    >>> X = data[data['Condition'] == 'Drug'][dvs]
    >>> Y = data[data['Condition'] == 'Placebo'][dvs]
    >>> pg.multivariate_ttest(X, Y)
                     T2         F  df1  df2      pval
    hotelling  4.228679  1.326644    3   32  0.282898

    Two-sample paired Hotelling T-squared test

    >>> pg.multivariate_ttest(X, Y, paired=True)
                     T2         F  df1  df2      pval
    hotelling  4.468456  1.314252    3   15  0.306542

    One-sample Hotelling T-squared test with a specified null hypothesis

    >>> null_hypothesis_means = [37.5, 70, 5]
    >>> pg.multivariate_ttest(X, Y=null_hypothesis_means)
                       T2          F  df1  df2          pval
    hotelling  253.230991  74.479703    3   15  3.081281e-09
    """
    from scipy.stats import f
    x = np.asarray(X)
    assert x.ndim == 2, 'x must be of shape (n_samples, n_features)'

    if Y is None:
        y = np.zeros(x.shape[1])
        # Remove rows with missing values in x
        x = x[~np.isnan(x).any(axis=1)]
    else:
        nx, kx = x.shape
        y = np.asarray(Y)
        assert y.ndim in [1, 2], 'Y must be 1D or 2D.'
        if y.ndim == 1:
            # One sample with specified null
            assert y.size == kx
        else:
            # Two-sample
            err = 'X and Y must have the same number of features (= columns).'
            assert y.shape[1] == kx, err
            if paired:
                err = 'X and Y must have the same number of rows if paired.'
                assert y.shape[0] == nx, err
        # Remove rows with missing values in both x and y
        x, y = remove_na(x, y, paired=paired, axis='rows')

    # Shape of arrays
    nx, k = x.shape
    ny = y.shape[0]
    assert nx >= 5, 'At least five samples are required.'

    if y.ndim == 1 or paired is True:
        n = nx
        if y.ndim == 1:
            # One sample test
            cov = np.cov(x, rowvar=False)
            diff = x.mean(0) - y
        else:
            # Paired two sample
            cov = np.cov(x - y, rowvar=False)
            diff = x.mean(0) - y.mean(0)
        inv_cov = np.linalg.pinv(cov)
        t2 = (diff @ inv_cov) @ diff * n
    else:
        n = nx + ny - 1
        x_cov = np.cov(x, rowvar=False)
        y_cov = np.cov(y, rowvar=False)
        pooled_cov = ((nx - 1) * x_cov + (ny - 1) * y_cov) / (n - 1)
        inv_cov = np.linalg.pinv((1 / nx + 1 / ny) * pooled_cov)
        diff = x.mean(0) - y.mean(0)
        t2 = (diff @ inv_cov) @ diff

    # F-value, degrees of freedom and p-value
    fval = t2 * (n - k) / (k * (n - 1))
    df1 = k
    df2 = n - k
    pval = f.sf(fval, df1, df2)

    # Create output dictionary
    stats = {'T2': t2, 'F': fval, 'df1': df1, 'df2': df2, 'pval': pval}
    stats = pd.DataFrame(stats, index=['hotelling'])
    return _postprocess_dataframe(stats)
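# Hedged sketch, illustration only: reproducing the one-sample Hotelling T-squared
# statistic by hand, following the same formulas as the function body above
# (T2 = n * d' S^-1 d, F = T2 * (n - k) / (k * (n - 1)), p = sf(F; k, n - k)).
import numpy as np
from scipy.stats import f

rng = np.random.default_rng(42)
X = rng.normal(size=(20, 3))               # 20 samples, 3 features
n, k = X.shape
d = X.mean(axis=0)                         # difference from the null (zero vector)
S = np.cov(X, rowvar=False)                # sample covariance matrix
t2 = n * d @ np.linalg.pinv(S) @ d
fval = t2 * (n - k) / (k * (n - 1))
pval = f.sf(fval, k, n - k)
print(round(t2, 6), round(fval, 6), round(pval, 6))
# Should match pingouin.multivariate_ttest(X) up to floating-point error.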
Example #3
def tost(x, y, bound=1, paired=False, correction=False):
    """Two One-Sided Test (TOST) for equivalence.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` should have the
        same units. If ``y`` is a single value (e.g. 0), a one-sample test is
        performed.
    bound : float
        Magnitude of region of similarity (a.k.a. epsilon). Note that this
        should be expressed in the same unit as ``x`` and ``y``.
    paired : boolean
        Specify whether the two observations are related (i.e. repeated
        measures) or independent.
    correction : auto or boolean
        Specify whether or not to correct for unequal variances using Welch
        separate variances T-test. This only applies if ``paired`` is False.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'bound'``: bound (= epsilon, or equivalence margin)
        * ``'dof'``: degrees of freedom
        * ``'pval'``: TOST p-value

    See also
    --------
    ttest

    References
    ----------
    .. [1] Schuirmann, D.L. 1981. On hypothesis testing to determine if the
           mean of a normal distribution is contained in a known interval.
           Biometrics 37 617.

    .. [2] https://cran.r-project.org/web/packages/equivalence/equivalence.pdf

    Examples
    --------
    1. Independent two-sample TOST with a region of similarity of 1 (default)

    >>> import pingouin as pg
    >>> a = [4, 7, 8, 6, 3, 2]
    >>> b = [6, 8, 7, 10, 11, 9]
    >>> pg.tost(a, b)
          bound  dof      pval
    TOST      1   10  0.965097

    2. Paired TOST with a different region of similarity

    >>> pg.tost(a, b, bound=0.5, paired=True)
          bound  dof      pval
    TOST    0.5    5  0.954854

    3. One sample TOST

    >>> pg.tost(a, y=0, bound=4)
          bound  dof      pval
    TOST      4    5  0.825967
    """
    x = np.asarray(x)
    y = np.asarray(y)
    assert isinstance(bound, (int, float)), 'bound must be int or float.'

    # T-tests
    df_a = ttest(x + bound,
                 y,
                 paired=paired,
                 correction=correction,
                 alternative='greater')
    df_b = ttest(x - bound,
                 y,
                 paired=paired,
                 correction=correction,
                 alternative='less')
    pval = max(df_a.at['T-test', 'p-val'], df_b.at['T-test', 'p-val'])

    # Create output dataframe
    stats = pd.DataFrame(
        {
            'bound': bound,
            'dof': df_a.at['T-test', 'dof'],
            'pval': pval
        },
        index=['TOST'])
    return _postprocess_dataframe(stats)
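# Hedged sketch, illustration only: TOST is just the larger of the two one-sided
# p-values computed above. Reproducing docstring example 1 with pingouin.ttest
# directly, using the same default bound of 1.
import pingouin as pg

a = [4, 7, 8, 6, 3, 2]
b = [6, 8, 7, 10, 11, 9]
bound = 1
p_greater = pg.ttest([v + bound for v in a], b, alternative='greater').at['T-test', 'p-val']
p_less = pg.ttest([v - bound for v in a], b, alternative='less').at['T-test', 'p-val']
print(max(p_greater, p_less))  # ~0.965097, matching pg.tost(a, b)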
Example #4
def corr(x, y, tail='two-sided', method='pearson', **kwargs):
    """(Robust) correlation between two variables.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        Specify whether to return ``'one-sided'`` or ``'two-sided'`` p-value.
        Note that the former is simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'r2'``: R-squared (:math:`= r^2`)
        * ``'adj_r2'``: Adjusted R-squared
        * ``'p-val'``: p-value of the test
        * ``'BF10'``: Bayes Factor of the alternative hypothesis
          (only for Pearson correlation)
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    pairwise_corr : Pairwise correlation between columns of a pandas DataFrame
    partial_corr : Partial correlation
    rm_corr : Repeated measures correlation

    Notes
    -----
    The `Pearson correlation coefficient
    <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
    measures the linear relationship between two datasets. Strictly speaking,
    Pearson's correlation requires that each dataset be normally distributed.
    Correlations of -1 or +1 imply a perfect negative and positive linear
    relationship, respectively, with 0 indicating the absence of association.

    .. math::
        r_{xy} = \\frac{\\sum_i(x_i - \\bar{x})(y_i - \\bar{y})}
        {\\sqrt{\\sum_i(x_i - \\bar{x})^2} \\sqrt{\\sum_i(y_i - \\bar{y})^2}}
        = \\frac{\\text{cov}(x, y)}{\\sigma_x \\sigma_y}

    where :math:`\\text{cov}` is the sample covariance and :math:`\\sigma`
    is the sample standard deviation.

    If ``method='pearson'``, The Bayes Factor is calculated using the
    :py:func:`pingouin.bayesfactor_pearson` function.

    The `Spearman correlation coefficient
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
    is a non-parametric measure of the monotonicity of the relationship between
    two datasets. Unlike the Pearson correlation, the Spearman correlation does
    not assume that both datasets are normally distributed. Correlations of -1
    or +1 imply an exact negative and positive monotonic relationship,
    respectively. Mathematically, the Spearman correlation coefficient is
    defined as the Pearson correlation coefficient between the
    `rank variables <https://en.wikipedia.org/wiki/Ranking>`_.

    The `Kendall correlation coefficient
    <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
    is a measure of the correspondence between two rankings. Values also range
    from -1 (perfect disagreement) to 1 (perfect agreement), with 0 indicating
    the absence of association. Consistent with
    :py:func:`scipy.stats.kendalltau`, Pingouin returns the Tau-b coefficient,
    which adjusts for ties:

    .. math:: \\tau_B = \\frac{(P - Q)}{\\sqrt{(P + Q + T) (P + Q + U)}}

    where :math:`P` is the number of concordant pairs, :math:`Q` the number of
    discordant pairs, :math:`T` the number of ties in x, and :math:`U`
    the number of ties in y.

    The `biweight midcorrelation
    <https://en.wikipedia.org/wiki/Biweight_midcorrelation>`_ and
    percentage bend correlation [1]_ are both robust methods that
    protect against *univariate* outliers by down-weighting observations that
    deviate too much from the median.

    The Shepherd pi [2]_ correlation and skipped [3]_, [4]_ correlation are
    both robust methods that return the Spearman correlation coefficient after
    removing *bivariate* outliers. Briefly, the Shepherd pi uses a
    bootstrapping of the Mahalanobis distance to identify outliers, while the
    skipped correlation is based on the minimum covariance determinant
    (which requires scikit-learn). Note that these two methods are
    significantly slower than the previous ones.

    .. important:: Please note that rows with missing values (NaN) are
        automatically removed.

    References
    ----------
    .. [1] Wilcox, R.R., 1994. The percentage bend correlation coefficient.
       Psychometrika 59, 601–616. https://doi.org/10.1007/BF02294395

    .. [2] Schwarzkopf, D.S., De Haas, B., Rees, G., 2012. Better ways to
       improve standards in brain-behavior correlation analysis. Front.
       Hum. Neurosci. 6, 200. https://doi.org/10.3389/fnhum.2012.00200

    .. [3] Rousselet, G.A., Pernet, C.R., 2012. Improving standards in
       brain-behavior correlation analyses. Front. Hum. Neurosci. 6, 119.
       https://doi.org/10.3389/fnhum.2012.00119

    .. [4] Pernet, C.R., Wilcox, R., Rousselet, G.A., 2012. Robust correlation
       analyses: false positive and power validation using a new open
       source matlab toolbox. Front. Psychol. 3, 606.
       https://doi.org/10.3389/fpsyg.2012.00606

    Examples
    --------
    1. Pearson correlation

    >>> import numpy as np
    >>> import pingouin as pg
    >>> # Generate random correlated samples
    >>> np.random.seed(123)
    >>> mean, cov = [4, 6], [(1, .5), (.5, 1)]
    >>> x, y = np.random.multivariate_normal(mean, cov, 30).T
    >>> # Compute Pearson correlation
    >>> pg.corr(x, y).round(3)
              n      r         CI95%     r2  adj_r2  p-val  BF10  power
    pearson  30  0.491  [0.16, 0.72]  0.242   0.185  0.006  8.55  0.809

    2. Pearson correlation with two outliers

    >>> x[3], y[5] = 12, -8
    >>> pg.corr(x, y).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121

    3. Spearman correlation (robust to outliers)

    >>> pg.corr(x, y, method="spearman").round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    spearman  30  0.401  [0.05, 0.67]  0.161   0.099  0.028   0.61

    4. Biweight midcorrelation (robust)

    >>> pg.corr(x, y, method="bicor").round(3)
            n      r         CI95%     r2  adj_r2  p-val  power
    bicor  30  0.393  [0.04, 0.66]  0.155   0.092  0.031  0.592

    5. Percentage bend correlation (robust)

    >>> pg.corr(x, y, method='percbend').round(3)
               n      r         CI95%     r2  adj_r2  p-val  power
    percbend  30  0.389  [0.03, 0.66]  0.151   0.089  0.034  0.581

    6. Shepherd's pi correlation (robust)

    >>> pg.corr(x, y, method='shepherd').round(3)
               n  outliers      r         CI95%     r2  adj_r2  p-val  power
    shepherd  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    7. Skipped spearman correlation (robust)

    >>> pg.corr(x, y, method='skipped').round(3)
              n  outliers      r         CI95%     r2  adj_r2  p-val  power
    skipped  30         2  0.437  [0.09, 0.69]  0.191   0.131   0.02  0.694

    8. One-tailed Pearson correlation

    >>> pg.corr(x, y, tail="one-sided", method='pearson').round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051   0.22  0.467  0.194

    9. Using columns of a pandas dataframe

    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': x, 'y': y})
    >>> pg.corr(data['x'], data['y']).round(3)
              n      r          CI95%     r2  adj_r2  p-val   BF10  power
    pearson  30  0.147  [-0.23, 0.48]  0.022  -0.051  0.439  0.302  0.121
    """
    # Safety check
    x = np.asarray(x)
    y = np.asarray(y)
    assert x.ndim == y.ndim == 1, 'x and y must be 1D arrays.'
    assert x.size == y.size, 'x and y must have the same length.'
    _msg = 'tail must be "two-sided" or "one-sided".'
    assert tail in ['two-sided', 'one-sided'], _msg

    # Remove rows with missing values
    x, y = remove_na(x, y, paired=True)
    nx = x.size

    # Compute correlation coefficient
    if method == 'pearson':
        r, pval = pearsonr(x, y)
    elif method == 'spearman':
        r, pval = spearmanr(x, y, **kwargs)
    elif method == 'kendall':
        r, pval = kendalltau(x, y, **kwargs)
    elif method == 'bicor':
        r, pval = bicor(x, y, **kwargs)
    elif method == 'percbend':
        r, pval = percbend(x, y, **kwargs)
    elif method == 'shepherd':
        r, pval, outliers = shepherd(x, y, **kwargs)
    elif method == 'skipped':
        r, pval, outliers = skipped(x, y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoids a sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame({'n': nx, 'r': np.nan, 'CI95%': np.nan,
                             'r2': np.nan, 'adj_r2': np.nan, 'p-val': np.nan,
                             'BF10': np.nan, 'power': np.nan}, index=[method])

    # Compute r2 and adj_r2
    r2 = r**2
    adj_r2 = 1 - (((1 - r2) * (nx - 1)) / (nx - 3))

    # Compute the parametric 95% confidence interval and power
    ci = compute_esci(stat=r, nx=nx, ny=nx, eftype='r', decimals=6)
    pr = power_corr(r=r, n=nx, power=None, alpha=0.05, tail=tail)

    # Create dictionary
    stats = {'n': nx,
             'r': r,
             'r2': r2,
             'adj_r2': adj_r2,
             'CI95%': [ci],
             'p-val': pval if tail == 'two-sided' else .5 * pval,
             'power': pr
             }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = sum(outliers)

    # Compute the BF10 for Pearson correlation only
    if method == 'pearson':
        stats['BF10'] = bayesfactor_pearson(r, nx, tail=tail)

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = ['n', 'outliers', 'r', 'CI95%', 'r2', 'adj_r2', 'p-val',
                'BF10', 'power']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
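# Hedged sketch, illustration only: checking the derived columns of pg.corr against
# the definitions used in the function body above (r2 = r**2 and
# adj_r2 = 1 - (1 - r2) * (n - 1) / (n - 3)).
import numpy as np
import pingouin as pg

np.random.seed(123)
x, y = np.random.multivariate_normal([4, 6], [(1, .5), (.5, 1)], 30).T
out = pg.corr(x, y)
r, n = out.at['pearson', 'r'], out.at['pearson', 'n']
assert np.isclose(out.at['pearson', 'r2'], r ** 2)
assert np.isclose(out.at['pearson', 'adj_r2'], 1 - (1 - r ** 2) * (n - 1) / (n - 3))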
Example #5
def rm_corr(data=None, x=None, y=None, subject=None, tail='two-sided'):
    """Repeated measures correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x, y : string
        Name of columns in ``data`` containing the two dependent variables.
    subject : string
        Name of column in ``data`` containing the subject indicator.
    tail : string
        Specify whether to return 'one-sided' or 'two-sided' p-value.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'r'``: Repeated measures correlation coefficient
        * ``'dof'``: Degrees of freedom
        * ``'pval'``: one- or two-tailed p-value
        * ``'CI95%'``: 95% parametric confidence intervals
        * ``'power'``: achieved power of the test (= 1 - type II error).

    See also
    --------
    plot_rm_corr

    Notes
    -----
    Repeated measures correlation (rmcorr) is a statistical technique
    for determining the common within-individual association for paired
    measures assessed on two or more occasions for multiple individuals.

    From `Bakdash and Marusich (2017)
    <https://doi.org/10.3389/fpsyg.2017.00456>`_:

        *Rmcorr accounts for non-independence among observations using analysis
        of covariance (ANCOVA) to statistically adjust for inter-individual
        variability. By removing measured variance between-participants,
        rmcorr provides the best linear fit for each participant using parallel
        regression lines (the same slope) with varying intercepts.
        Like a Pearson correlation coefficient, the rmcorr coefficient
        is bounded by − 1 to 1 and represents the strength of the linear
        association between two variables.*

    Results have been tested against the
    `rmcorr <https://github.com/cran/rmcorr>`_ R package.

    Please note that missing values are automatically removed from the
    dataframe (listwise deletion).

    Examples
    --------
    >>> import pingouin as pg
    >>> df = pg.read_dataset('rm_corr')
    >>> pg.rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
                   r  dof      pval           CI95%     power
    rm_corr -0.50677   38  0.000847  [-0.71, -0.23]  0.929579

    Now plot using the :py:func:`pingouin.plot_rm_corr` function:

    .. plot::

        >>> import pingouin as pg
        >>> df = pg.read_dataset('rm_corr')
        >>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
    """
    from pingouin import ancova, power_corr
    # Safety checks
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame'
    assert x in data.columns, 'The %s column is not in data.' % x
    assert y in data.columns, 'The %s column is not in data.' % y
    assert data[x].dtype.kind in 'bfiu', '%s must be numeric.' % x
    assert data[y].dtype.kind in 'bfiu', '%s must be numeric.' % y
    assert subject in data.columns, 'The %s column is not in data.' % subject
    if data[subject].nunique() < 3:
        raise ValueError('rm_corr requires at least 3 unique subjects.')

    # Remove missing values
    data = data[[x, y, subject]].dropna(axis=0)

    # Using PINGOUIN
    # For max precision, make sure rounding is disabled
    old_options = options.copy()
    options['round'] = None
    aov = ancova(dv=y, covar=x, between=subject, data=data)
    options.update(old_options)  # restore options
    bw = aov.bw_  # Beta within parameter
    sign = np.sign(bw)
    dof = int(aov.at[2, 'DF'])
    n = dof + 2
    ssfactor = aov.at[1, 'SS']
    sserror = aov.at[2, 'SS']
    rm = sign * np.sqrt(ssfactor / (ssfactor + sserror))
    pval = aov.at[1, 'p-unc']
    pval = pval * 0.5 if tail == 'one-sided' else pval
    ci = compute_esci(stat=rm, nx=n, eftype='pearson').tolist()
    pwr = power_corr(r=rm, n=n, tail=tail)
    # Convert to Dataframe
    stats = pd.DataFrame({"r": rm,
                          "dof": int(dof),
                          "pval": pval,
                          "CI95%": [ci],
                          "power": pwr}, index=["rm_corr"])
    return _postprocess_dataframe(stats)
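# Hedged sketch, illustration only: the rmcorr coefficient derived from the ANCOVA
# table, following the same row indexing as the function body above
# (r_rm = sign(bw) * sqrt(SS_covar / (SS_covar + SS_error))). Assumes the ``bw_``
# attribute attached by pg.ancova, as used above, is available on the returned table.
import numpy as np
import pingouin as pg

df = pg.read_dataset('rm_corr')
aov = pg.ancova(data=df, dv='PacO2', covar='pH', between='Subject')
ss_covar, ss_error = aov.at[1, 'SS'], aov.at[2, 'SS']
rm = np.sign(aov.bw_) * np.sqrt(ss_covar / (ss_covar + ss_error))
print(round(rm, 5))  # ~ -0.50677, matching pg.rm_corr above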
Example #6
def partial_corr(data=None, x=None, y=None, covar=None, x_covar=None,
                 y_covar=None, tail='two-sided', method='pearson', **kwargs):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas DataFrame. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    tail : string
        Specify whether to return `'one-sided'` or `'two-sided'` p-value.
        The former is simply half the latter.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation
        * ``'kendall'``: Kendall's :math:`\\tau_B` correlation
          (for ordinal data)
        * ``'bicor'``: Biweight midcorrelation (robust)
        * ``'percbend'``: Percentage bend correlation (robust)
        * ``'shepherd'``: Shepherd's pi correlation (robust)
        * ``'skipped'``: Skipped correlation (robust)
    **kwargs : optional
        Optional argument(s) passed to the lower-level correlation functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'outliers'``: number of outliers, only if a robust method was used
        * ``'r'``: Correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value of the test

    See also
    --------
    corr, pairwise_corr, rm_corr

    Notes
    -----
    From [1]_:

        *With partial correlation, we find the correlation between x
        and y holding C constant for both x and
        y. Sometimes, however, we want to hold C constant for
        just x or just y. In that case, we compute a
        semi-partial correlation. A partial correlation is computed between
        two residuals. A semi-partial correlation is computed between one
        residual and another raw (or unresidualized) variable.*

    Note that if you are not interested in calculating the p-values [2]_
    but only the partial correlation matrix, a faster
    alternative is to use :py:func:`pingouin.pcorr` (see example 4).

    Rows with missing values are automatically removed from data. Results have
    been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    References
    ----------
    .. [1] http://faculty.cas.usf.edu/mbrannick/regression/Partial.html

    .. [2] https://online.stat.psu.edu/stat505/lesson/6/6.3

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.491  [0.14, 0.73]  0.009

    3. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.568  [0.26, 0.77]  0.001

    4. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    5. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y',
    ...                 x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list
    # Safety check
    assert tail in ['two-sided', 'one-sided'], (
        'tail must be "two-sided" or "one-sided".')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string.'
    assert isinstance(y, (str, tuple)), 'y must be a string.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]

    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    # dof = n - k - 2
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    C = (data[col] - data[col].mean(axis=0)) / data[col].std(axis=0)

    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(C[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
        res_x = C[x].to_numpy() - cvar @ beta_x
        res_y = C[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(C[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, C[x].to_numpy(), rcond=None)[0]
            res_x = C[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(C[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, C[y].to_numpy(), rcond=None)[0]
            res_y = C[y].to_numpy() - cvar @ beta_y

    # Compute partial correlation coefficient
    # We do not extract the p-values at this stage because they do not account
    # for the number of covariates in the degrees of freedom
    if method == 'pearson':
        r, _ = pearsonr(res_x, res_y)
    elif method == 'spearman':
        r, _ = spearmanr(res_x, res_y, **kwargs)
    elif method == 'kendall':
        r, _ = kendalltau(res_x, res_y, **kwargs)
    elif method == 'bicor':
        r, _ = bicor(res_x, res_y, **kwargs)
    elif method == 'percbend':
        r, _ = percbend(res_x, res_y, **kwargs)
    elif method == 'shepherd':
        r, _, outliers = shepherd(res_x, res_y, **kwargs)
    elif method == 'skipped':
        r, _, outliers = skipped(res_x, res_y, **kwargs)
    else:
        raise ValueError(f'Method "{method}" not recognized.')

    if np.isnan(r):
        # Correlation failed -- new in version v0.3.4, instead of raising an
        # error we just return a dataframe full of NaN (except sample size).
        # This avoids a sudden stop in pingouin.pairwise_corr.
        return pd.DataFrame({'n': n, 'r': np.nan, 'CI95%': np.nan,
                             'p-val': np.nan}, index=[method])

    # Sample size after outlier removal
    n_outliers = sum(outliers) if "outliers" in locals() else 0
    n_clean = n - n_outliers

    # Compute the two-sided p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n_clean, k)
    ci = compute_esci(
        stat=r, nx=(n_clean - k), ny=(n_clean - k), eftype='r', decimals=6)

    # Create dictionary
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval if tail == 'two-sided' else .5 * pval,
    }

    if method in ['shepherd', 'skipped']:
        stats['outliers'] = n_outliers

    # Convert to DataFrame
    stats = pd.DataFrame.from_records(stats, index=[method])

    # Define order
    col_keep = ['n', 'outliers', 'r', 'CI95%', 'p-val']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
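# Hedged sketch, illustration only: the regression-based partial correlation written
# out by hand, mirroring the function body above: standardize, regress x and y on
# the covariate, then correlate the two residuals.
import numpy as np
import pingouin as pg
from scipy.stats import pearsonr

df = pg.read_dataset('partial_corr')
C = (df - df.mean()) / df.std()   # standardize, so no intercept is needed
cvar = C[['cv1']].to_numpy()
res_x = C['x'].to_numpy() - cvar @ np.linalg.lstsq(cvar, C['x'].to_numpy(), rcond=None)[0]
res_y = C['y'].to_numpy() - cvar @ np.linalg.lstsq(cvar, C['y'].to_numpy(), rcond=None)[0]
print(round(pearsonr(res_x, res_y)[0], 3))  # ~0.568, matching docstring example 1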
Example #7
def partial_corr(data=None,
                 x=None,
                 y=None,
                 covar=None,
                 x_covar=None,
                 y_covar=None,
                 alternative='two-sided',
                 method='pearson'):
    """Partial and semi-partial correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas Dataframe. Note that this function can also directly be used
        as a :py:class:`pandas.DataFrame` method, in which case this argument
        is no longer needed.
    x, y : string
        x and y. Must be names of columns in ``data``.
    covar : string or list
        Covariate(s). Must be names of columns in ``data``. Use a list if
        there are two or more covariates.
    x_covar : string or list
        Covariate(s) for the ``x`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``x_covar`` is removed
        from ``x`` but not from ``y``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    y_covar : string or list
        Covariate(s) for the ``y`` variable. This is used to compute
        semi-partial correlation (i.e. the effect of ``y_covar`` is removed
        from ``y`` but not from ``x``). Only one of ``covar``,  ``x_covar`` and
        ``y_covar`` can be specified.
    alternative : string
        Defines the alternative hypothesis, or tail of the partial correlation. Must be one of
        "two-sided" (default), "greater" or "less". Both "greater" and "less" return a one-sided
        p-value. "greater" tests against the alternative hypothesis that the partial correlation is
        positive (greater than zero), "less" tests against the hypothesis that the partial
        correlation is negative.
    method : string
        Correlation type:

        * ``'pearson'``: Pearson :math:`r` product-moment correlation
        * ``'spearman'``: Spearman :math:`\\rho` rank-order correlation

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'n'``: Sample size (after removal of missing values)
        * ``'r'``: Partial correlation coefficient
        * ``'CI95%'``: 95% parametric confidence intervals around :math:`r`
        * ``'p-val'``: p-value

    See also
    --------
    corr, pcorr, pairwise_corr, rm_corr

    Notes
    -----
    Partial correlation [1]_ measures the degree of association between ``x``
    and ``y``, after removing the effect of one or more controlling variables
    (``covar``, or :math:`Z`). Practically, this is achieved by calculating the
    correlation coefficient between the residuals of two linear regressions:

    .. math:: x \\sim Z, y \\sim Z

    Like the correlation coefficient, the partial correlation
    coefficient takes on a value in the range from –1 to 1, where 1 indicates a
    perfect positive association.

    The semipartial correlation is similar to the partial correlation,
    with the exception that the set of controlling variables is only
    removed for either ``x`` or ``y``, but not both.

    Pingouin uses the method described in [2]_ to calculate the (semi)partial
    correlation coefficients and associated p-values. This method is based on
    the inverse covariance matrix and is significantly faster than the
    traditional regression-based method. Results have been tested against the
    `ppcor <https://cran.r-project.org/web/packages/ppcor/index.html>`_
    R package.

    .. important:: Rows with missing values are automatically removed from
        data.

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Partial_correlation

    .. [2] https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681537/

    Examples
    --------
    1. Partial correlation with one covariate

    >>> import pingouin as pg
    >>> df = pg.read_dataset('partial_corr')
    >>> pg.partial_corr(data=df, x='x', y='y', covar='cv1').round(3)
              n      r         CI95%  p-val
    pearson  30  0.568  [0.25, 0.77]  0.001

    2. Spearman partial correlation with several covariates

    >>> # Partial correlation of x and y controlling for cv1, cv2 and cv3
    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [0.18, 0.75]  0.005

    3. Same but one-sided test

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="greater", method='spearman').round(3)
               n      r        CI95%  p-val
    spearman  30  0.521  [0.24, 1.0]  0.003

    >>> pg.partial_corr(data=df, x='x', y='y', covar=['cv1', 'cv2', 'cv3'],
    ...                 alternative="less", method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.521  [-1.0, 0.72]  0.997

    4. As a pandas method

    >>> df.partial_corr(x='x', y='y', covar=['cv1'], method='spearman').round(3)
               n      r         CI95%  p-val
    spearman  30  0.578  [0.27, 0.78]  0.001

    5. Partial correlation matrix (returns only the correlation coefficients)

    >>> df.pcorr().round(3)
             x      y    cv1    cv2    cv3
    x    1.000  0.493 -0.095  0.130 -0.385
    y    0.493  1.000 -0.007  0.104 -0.002
    cv1 -0.095 -0.007  1.000 -0.241 -0.470
    cv2  0.130  0.104 -0.241  1.000 -0.118
    cv3 -0.385 -0.002 -0.470 -0.118  1.000

    6. Semi-partial correlation on x

    >>> pg.partial_corr(data=df, x='x', y='y', x_covar=['cv1', 'cv2', 'cv3']).round(3)
              n      r        CI95%  p-val
    pearson  30  0.463  [0.1, 0.72]  0.015
    """
    from pingouin.utils import _flatten_list
    # Safety check
    assert alternative in [
        'two-sided', 'greater', 'less'
    ], ("Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
        )
    assert method in [
        'pearson', 'spearman'
    ], ('only "pearson" and "spearman" are supported for partial correlation.')
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    if x_covar is not None and y_covar is not None:
        raise ValueError('Cannot specify both x_covar and y_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    if isinstance(covar, list):
        assert x not in covar, 'x and covar must be independent'
        assert y not in covar, 'y and covar must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    n = data.shape[0]  # Number of samples
    k = data.shape[1] - 2  # Number of covariates
    assert n > 2, 'Data must have at least 3 non-NAN samples.'

    # Calculate the partial correlation matrix - similar to pingouin.pcorr()
    if method == "spearman":
        # Convert the data to rank, similar to R cov()
        V = data.rank(na_option='keep').cov()
    else:
        V = data.cov()
    Vi = np.linalg.pinv(V, hermitian=True)  # Inverse covariance matrix
    Vi_diag = Vi.diagonal()
    D = np.diag(np.sqrt(1 / Vi_diag))
    pcor = -1 * (D @ Vi @ D)  # Partial correlation matrix

    if covar is not None:
        r = pcor[0, 1]
    else:
        # Semi-partial correlation matrix
        with np.errstate(divide='ignore'):
            spcor = pcor / \
                np.sqrt(np.diag(V))[..., None] / \
                np.sqrt(np.abs(Vi_diag - Vi ** 2 / Vi_diag[..., None])).T
        if y_covar is not None:
            r = spcor[0, 1]  # y_covar is removed from y
        else:
            r = spcor[1, 0]  # x_covar is removed from x

    if np.isnan(r):
        # Correlation failed. Return NaN. When would this happen?
        return pd.DataFrame(
            {
                'n': n,
                'r': np.nan,
                'CI95%': np.nan,
                'p-val': np.nan
            },
            index=[method])

    # Compute the two-sided p-value and confidence intervals
    # https://online.stat.psu.edu/stat505/lesson/6/6.3
    pval = _correl_pvalue(r, n, k, alternative)
    ci = compute_esci(stat=r,
                      nx=(n - k),
                      ny=(n - k),
                      eftype='r',
                      decimals=6,
                      alternative=alternative)

    # Create dictionary
    stats = {
        'n': n,
        'r': r,
        'CI95%': [ci],
        'p-val': pval,
    }

    # Convert to DataFrame
    stats = pd.DataFrame(stats, index=[method])

    # Define order
    col_keep = ['n', 'r', 'CI95%', 'p-val']
    col_order = [k for k in col_keep if k in stats.keys().tolist()]
    return _postprocess_dataframe(stats)[col_order]
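# Hedged sketch, illustration only: the inverse-covariance route used above, which
# is essentially what pingouin.pcorr computes for the full correlation matrix.
import numpy as np
import pingouin as pg

df = pg.read_dataset('partial_corr')
V = df[['x', 'y', 'cv1']].cov()                 # covariance of x, y and the covariate
Vi = np.linalg.pinv(V, hermitian=True)          # inverse covariance matrix
D = np.diag(np.sqrt(1 / Vi.diagonal()))
pcor = -1 * (D @ Vi @ D)                        # partial correlation matrix
print(round(pcor[0, 1], 3))  # ~0.568, matching docstring example 1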
Example #8
def normality(data, dv=None, group=None, method="shapiro", alpha=.05):
    """Univariate normality test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, series, list or 1D np.array
        Iterable. Can be either a single list, 1D numpy array,
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Normality test. `'shapiro'` (default) performs the Shapiro-Wilk test
        using :py:func:`scipy.stats.shapiro`, and `'normaltest'` performs the
        omnibus test of normality using :py:func:`scipy.stats.normaltest`.
        The latter is more appropriate for large samples.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W'``: Test statistic.
        * ``'pval'``: p-value.
        * ``'normal'``: True if ``data`` is normally distributed.

    See Also
    --------
    homoscedasticity : Test equality of variance.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The Shapiro-Wilk test calculates a :math:`W` statistic that tests whether a
    random sample :math:`x_1, x_2, ..., x_n` comes from a normal distribution.

    The :math:`W` statistic is calculated as follows:

    .. math::

        W = \\frac{(\\sum_{i=1}^n a_i x_{i})^2}
        {\\sum_{i=1}^n (x_i - \\overline{x})^2}

    where the :math:`x_i` are the ordered sample values (in ascending
    order) and the :math:`a_i` are constants generated from the means,
    variances and covariances of the order statistics of a sample of size
    :math:`n` from a standard normal distribution. Specifically:

    .. math:: (a_1, ..., a_n) = \\frac{m^TV^{-1}}{(m^TV^{-1}V^{-1}m)^{1/2}}

    with :math:`m = (m_1, ..., m_n)^T` and :math:`(m_1, ..., m_n)` are the
    expected values of the order statistics of independent and identically
    distributed random variables sampled from the standard normal distribution,
    and :math:`V` is the covariance matrix of those order statistics.

    The null-hypothesis of this test is that the population is normally
    distributed. Thus, if the p-value is less than the
    chosen alpha level (typically set at 0.05), then the null hypothesis is
    rejected and there is evidence that the data tested are not normally
    distributed.

    The result of the Shapiro-Wilk test should be interpreted with caution in
    the case of large sample sizes. Indeed, quoting from
    `Wikipedia <https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test>`_:

        *"Like most statistical significance tests, if the sample size is
        sufficiently large this test may detect even trivial departures from
        the null hypothesis (i.e., although there may be some statistically
        significant effect, it may be too small to be of any practical
        significance); thus, additional investigation of the effect size is
        typically advisable, e.g., a Q–Q plot in this case."*

    Note that missing values are automatically removed (casewise deletion).

    References
    ----------
    * Shapiro, S. S., & Wilk, M. B. (1965). An analysis of variance test
      for normality (complete samples). Biometrika, 52(3/4), 591-611.

    * https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm

    Examples
    --------
    1. Shapiro-Wilk test on a 1D array.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.normal(size=100)
    >>> pg.normality(x)
             W      pval  normal
    0  0.98414  0.274886    True

    2. Omnibus test on a wide-format dataframe with missing values

    >>> data = pg.read_dataset('mediation')
    >>> data.loc[1, 'X'] = np.nan
    >>> pg.normality(data, method='normaltest').round(3)
                W   pval  normal
    X       1.792  0.408    True
    M       0.492  0.782    True
    Y       0.349  0.840    True
    Mbin  839.716  0.000   False
    Ybin  814.468  0.000   False
    W1     24.816  0.000   False
    W2     43.400  0.000   False

    3. Pandas Series

    >>> pg.normality(data['X'], method='normaltest')
              W      pval  normal
    X  1.791839  0.408232    True

    4. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.normality(data, dv='Performance', group='Time')
                 W      pval  normal
    Pre   0.967718  0.478773    True
    Post  0.940728  0.095157    True
    """
    assert isinstance(data, (pd.DataFrame, pd.Series, list, np.ndarray))
    assert method in ['shapiro', 'normaltest']
    if isinstance(data, pd.Series):
        data = data.to_frame()
    col_names = ['W', 'pval']
    func = getattr(scipy.stats, method)
    if isinstance(data, (list, np.ndarray)):
        data = np.asarray(data)
        assert data.ndim == 1, 'Data must be 1D.'
        assert data.size > 3, 'Data must have more than 3 samples.'
        data = remove_na(data)
        stats = pd.DataFrame(func(data)).T
        stats.columns = col_names
        stats['normal'] = np.where(stats['pval'] > alpha, True, False)
    else:
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            stats = numdata.apply(lambda x: func(x.dropna()),
                                  result_type='expand',
                                  axis=0).T
            stats.columns = col_names
            stats['normal'] = np.where(stats['pval'] > alpha, True, False)
        else:
            # Long-format
            stats = pd.DataFrame([])
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group, observed=True, sort=False)
            cols = grp.groups.keys()
            for _, tmp in grp:
                stats = pd.concat([stats, normality(
                    tmp[dv].to_numpy(), method=method, alpha=alpha)])
            stats.index = cols
    return _postprocess_dataframe(stats)
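# Hedged sketch, illustration only: for a single 1D sample, pg.normality is a thin
# wrapper around the corresponding scipy.stats test, as in the function body above.
import numpy as np
import scipy.stats
import pingouin as pg

np.random.seed(123)
x = np.random.normal(size=100)
W, p = scipy.stats.shapiro(x)
out = pg.normality(x)
assert np.isclose(out.at[0, 'W'], W) and np.isclose(out.at[0, 'pval'], p)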
Example #9
def box_m(data, dvs, group, alpha=.001):
    """Test equality of covariance matrices using the Box's M test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format dataframe.
    dvs : list
        Dependent variables.
    group : str
        Grouping variable.
    alpha : float
        Significance level. Default is 0.001 as recommended in [2]_. A
        non-significant p-value (higher than alpha) indicates that the
        covariance matrices are homogeneous (= equal).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Chi2'``: Test statistic
        * ``'pval'``: p-value
        * ``'df'``: The Chi-Square statistic's degree of freedom
        * ``'equal_cov'``: True if ``data`` has equal covariance

    Notes
    -----
    .. warning:: Box's M test is susceptible to errors if the data does not
        meet the assumption of multivariate normality or if the sample size is
        too large or small [3]_.

    Pingouin uses :py:meth:`pandas.DataFrameGroupBy.cov` to calculate the
    variance-covariance matrix of each group. Missing values are automatically
    excluded from the calculation by Pandas.

    Mathematical expressions can be found in [1]_.

    This function has been tested against the ``boxM`` function of the `biotools`
    R package [4]_.

    References
    ----------
    .. [1] Rencher, A. C. (2003). Methods of multivariate analysis (Vol. 492).
           John Wiley & Sons.

    .. [2] Hahs-Vaughn, D. (2016). Applied Multivariate Statistical Concepts.
           Taylor & Francis.

    .. [3] https://en.wikipedia.org/wiki/Box%27s_M_test

    .. [4] https://cran.r-project.org/web/packages/biotools/index.html

    Examples
    --------
    1. Box M test with 3 dependent variables of 4 groups (equal sample size)

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> from scipy.stats import multivariate_normal as mvn
    >>> data = pd.DataFrame(mvn.rvs(size=(100, 3), random_state=42),
    ...                     columns=['A', 'B', 'C'])
    >>> data['group'] = [1] * 25 + [2] * 25 + [3] * 25 + [4] * 25
    >>> data.head()
              A         B         C  group
    0  0.496714 -0.138264  0.647689      1
    1  1.523030 -0.234153 -0.234137      1
    2  1.579213  0.767435 -0.469474      1
    3  0.542560 -0.463418 -0.465730      1
    4  0.241962 -1.913280 -1.724918      1

    >>> pg.box_m(data, dvs=['A', 'B', 'C'], group='group')
              Chi2    df      pval  equal_cov
    box  11.634185  18.0  0.865537       True

    2. Box M test with 3 dependent variables of 2 groups (unequal sample size)

    >>> data = pd.DataFrame(mvn.rvs(size=(30, 2), random_state=42),
    ...                     columns=['A', 'B'])
    >>> data['group'] = [1] * 20 + [2] * 10
    >>> pg.box_m(data, dvs=['A', 'B'], group='group')
             Chi2   df      pval  equal_cov
    box  0.706709  3.0  0.871625       True
    """
    # Safety checks
    from scipy.stats import chi2
    assert isinstance(data, pd.DataFrame), "data must be a pandas dataframe."
    assert group in data.columns, "The grouping variable is not in data."
    assert set(dvs).issubset(data.columns), "The DVs are not in data."
    grp = data.groupby(group, observed=True)[dvs]
    assert grp.ngroups > 1, 'Data must have at least two groups.'

    # Calculate covariance matrix and descriptive statistics
    # - n_covs is the number of covariance matrices
    # - n_dvs is the number of variables
    # - n_samp is the number of samples in each covariance matrix
    # - nobs is the total number of observations
    covs = grp.cov()
    n_covs, n_dvs = covs.index.levshape
    n_samp = grp.count().iloc[:, 0].to_numpy()  # NaN are excluded by .count
    nobs = n_samp.sum()
    v = n_samp - 1

    # Calculate pooled covariance matrix (S) and M statistics
    covs = covs.to_numpy().reshape(n_covs, n_dvs, -1)
    S = (covs * v[..., None, None]).sum(axis=0) / (nobs - n_covs)
    # The following lines might raise an error if the covariance matrices are
    # not invertible (e.g. missing values in input).
    S_det = np.linalg.det(S)
    M = ((np.linalg.det(covs) / S_det)**(v / 2)).prod()

    # Calculate C in reference [1] (page 257-259)
    if len(np.unique(n_samp)) == 1:
        # All groups have same number of samples
        c = ((n_covs + 1) * (2 * n_dvs ** 2 + 3 * n_dvs - 1)) \
            / (6 * n_covs * (n_dvs + 1) * (nobs / n_covs - 1))
    else:
        # Unequal sample size
        c = (2 * n_dvs**2 + 3 * n_dvs - 1) / (6 * (n_dvs + 1) * (n_covs - 1))
        c *= ((1 / v).sum() - 1 / v.sum())

    # Calculate U statistic and degrees of freedom
    u = -2 * (1 - c) * np.log(M)
    df = 0.5 * n_dvs * (n_dvs + 1) * (n_covs - 1)
    p = chi2.sf(u, df)
    equal_cov = True if p > alpha else False
    stats = pd.DataFrame(index=["box"],
                         data={
                             'Chi2': [u],
                             'df': [df],
                             'pval': [p],
                             'equal_cov': [equal_cov]
                         })
    return _postprocess_dataframe(stats)
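# Hedged sketch, illustration only: the Chi-square degrees of freedom reported by
# box_m follow df = p * (p + 1) * (k - 1) / 2, with p dependent variables and
# k groups, as computed in the function body above.
import pandas as pd
import pingouin as pg
from scipy.stats import multivariate_normal as mvn

data = pd.DataFrame(mvn.rvs(size=(100, 3), random_state=42), columns=['A', 'B', 'C'])
data['group'] = [1] * 25 + [2] * 25 + [3] * 25 + [4] * 25
out = pg.box_m(data, dvs=['A', 'B', 'C'], group='group')
n_dvs, n_groups = 3, 4
assert out.at['box', 'df'] == n_dvs * (n_dvs + 1) * (n_groups - 1) / 2  # 18.0, as above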
Example #10
def homoscedasticity(data, dv=None, group=None, method="levene", alpha=.05):
    """Test equality of variance.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`, list or dict
        Iterable. Can be either a list / dictionary of iterables
        or a wide- or long-format pandas dataframe.
    dv : str
        Dependent variable (only when ``data`` is a long-format dataframe).
    group : str
        Grouping variable (only when ``data`` is a long-format dataframe).
    method : str
        Statistical test. `'levene'` (default) performs the Levene test
        using :py:func:`scipy.stats.levene`, and `'bartlett'` performs the
        Bartlett test using :py:func:`scipy.stats.bartlett`.
        The former is more robust to departure from normality.
    alpha : float
        Significance level.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W/T'``: Test statistic ('W' for Levene, 'T' for Bartlett)
        * ``'pval'``: p-value
        * ``'equal_var'``: True if ``data`` has equal variance

    See Also
    --------
    normality : Univariate normality test.
    sphericity : Mauchly's test for sphericity.

    Notes
    -----
    The **Bartlett** :math:`T` statistic [1]_ is defined as:

    .. math::

        T = \\frac{(N-k) \\ln{s^{2}_{p}} - \\sum_{i=1}^{k}(N_{i} - 1)
        \\ln{s^{2}_{i}}}{1 + (1/(3(k-1)))((\\sum_{i=1}^{k}{1/(N_{i} - 1))}
        - 1/(N-k))}

    where :math:`s_i^2` is the variance of the :math:`i^{th}` group,
    :math:`N` is the total sample size, :math:`N_i` is the sample size of the
    :math:`i^{th}` group, :math:`k` is the number of groups,
    and :math:`s_p^2` is the pooled variance.

    The pooled variance is a weighted average of the group variances and is
    defined as:

    .. math:: s^{2}_{p} = \\sum_{i=1}^{k}(N_{i} - 1)s^{2}_{i}/(N-k)

    The p-value is then computed using a chi-square distribution:

    .. math:: T \\sim \\chi^2(k-1)

    The **Levene** :math:`W` statistic [2]_ is defined as:

    .. math::

        W = \\frac{(N-k)} {(k-1)}
        \\frac{\\sum_{i=1}^{k}N_{i}(\\overline{Z}_{i.}-\\overline{Z})^{2} }
        {\\sum_{i=1}^{k}\\sum_{j=1}^{N_i}(Z_{ij}-\\overline{Z}_{i.})^{2} }

    where :math:`Z_{ij} = |Y_{ij} - \\text{median}({Y}_{i.})|`,
    :math:`\\overline{Z}_{i.}` are the group means of :math:`Z_{ij}` and
    :math:`\\overline{Z}` is the grand mean of :math:`Z_{ij}`.

    The p-value is then computed using a F-distribution:

    .. math:: W \\sim F(k-1, N-k)

    .. warning:: Missing values are not supported for this function.
        Make sure to remove them before using the
        :py:meth:`pandas.DataFrame.dropna` or :py:func:`pingouin.remove_na`
        functions.

    References
    ----------
    .. [1] Bartlett, M. S. (1937). Properties of sufficiency and statistical
           tests. Proc. R. Soc. Lond. A, 160(901), 268-282.

    .. [2] Brown, M. B., & Forsythe, A. B. (1974). Robust tests for the
           equality of variances. Journal of the American Statistical
           Association, 69(346), 364-367.

    Examples
    --------
    1. Levene test on a wide-format dataframe

    >>> import numpy as np
    >>> import pingouin as pg
    >>> data = pg.read_dataset('mediation')
    >>> pg.homoscedasticity(data[['X', 'Y', 'M']])
                   W      pval  equal_var
    levene  0.434861  0.999997       True

    2. Bartlett test using a list of iterables

    >>> data = [[4, 8, 9, 20, 14], np.array([5, 8, 15, 45, 12])]
    >>> pg.homoscedasticity(data, method="bartlett", alpha=.05)
                     T      pval  equal_var
    bartlett  2.873569  0.090045       True

    3. Long-format dataframe

    >>> data = pg.read_dataset('rm_anova2')
    >>> pg.homoscedasticity(data, dv='Performance', group='Time')
                   W      pval  equal_var
    levene  3.192197  0.079217       True
    """
    assert isinstance(data, (pd.DataFrame, list, dict))
    assert method.lower() in ['levene', 'bartlett']
    func = getattr(scipy.stats, method)
    if isinstance(data, pd.DataFrame):
        # Data is a Pandas DataFrame
        if dv is None and group is None:
            # Wide-format
            # Get numeric data only
            numdata = data._get_numeric_data()
            assert numdata.shape[1] > 1, 'Data must have at least two columns.'
            statistic, p = func(*numdata.to_numpy())
        else:
            # Long-format
            assert group in data.columns
            assert dv in data.columns
            grp = data.groupby(group, observed=True)[dv]
            assert grp.ngroups > 1, 'Data must have at least two groups.'
            statistic, p = func(*grp.apply(list))
    elif isinstance(data, list):
        # Check that list contains other list or np.ndarray
        assert all(isinstance(el, (list, np.ndarray)) for el in data)
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data)
    else:
        # Data is a dict
        assert all(isinstance(el, (list, np.ndarray)) for el in data.values())
        assert len(data) > 1, 'Data must have at least two iterables.'
        statistic, p = func(*data.values())

    equal_var = True if p > alpha else False
    stat_name = 'W' if method.lower() == 'levene' else 'T'

    stats = pd.DataFrame(
        {
            stat_name: statistic,
            'pval': p,
            'equal_var': equal_var
        },
        index=[method])

    return _postprocess_dataframe(stats)
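
A quick way to connect the Levene formula in the Notes above to the code is
to compute W by hand and compare it with scipy.stats.levene, which also
centers on the group medians by default. A minimal sketch, assuming only
NumPy and SciPy and re-using the two small samples from the Bartlett example:

import numpy as np
from scipy.stats import levene, f as f_dist

# Two small samples, re-used from the docstring example above
y1 = np.array([4, 8, 9, 20, 14], dtype=float)
y2 = np.array([5, 8, 15, 45, 12], dtype=float)
groups = [y1, y2]

# Z_ij = |Y_ij - median(Y_i.)|
z = [np.abs(g - np.median(g)) for g in groups]
n_i = np.array([len(g) for g in groups])
N, k = n_i.sum(), len(groups)
zbar_i = np.array([zi.mean() for zi in z])  # group means of Z
zbar = np.concatenate(z).mean()             # grand mean of Z

# W = (N - k)/(k - 1) * sum(n_i * (Zbar_i - Zbar)^2) / sum((Z_ij - Zbar_i)^2)
num = (N - k) * np.sum(n_i * (zbar_i - zbar) ** 2)
den = (k - 1) * np.sum([np.sum((zi - zb) ** 2) for zi, zb in zip(z, zbar_i)])
W = num / den
pval = f_dist.sf(W, k - 1, N - k)

# Both should agree with SciPy's implementation
W_ref, p_ref = levene(y1, y2)
assert np.isclose(W, W_ref) and np.isclose(pval, p_ref)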
Example #11
0
def mediation_analysis(data=None, x=None, m=None, y=None, covar=None,
                       alpha=0.05, n_boot=500, seed=None, return_dist=False):
    """Mediation analysis using a bias-correct non-parametric bootstrap method.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Dataframe.
    x : str
        Column name in data containing the predictor variable.
        The predictor variable must be continuous.
    m : str or list of str
        Column name(s) in data containing the mediator variable(s).
        The mediator(s) can be continuous or binary (e.g. 0 or 1).
        This function supports multiple parallel mediators.
    y : str
        Column name in data containing the outcome variable.
        The outcome variable must be continuous.
    covar : None, str, or list
        Covariate(s). If not None, the specified covariate(s) will be included
        in all regressions.
    alpha : float
        Significance threshold. Used to determine the confidence interval,
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`.
    n_boot : int
        Number of bootstrap iterations for confidence intervals and p-values
        estimation. The greater, the slower.
    seed : int or None
        Random state seed.
    return_dist : bool
        If True, the function also returns the indirect bootstrapped beta
        samples (size = n_boot). Can be plotted for instance using
        :py:func:`seaborn.distplot()` or :py:func:`seaborn.kdeplot()`
        functions.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Mediation summary:

        * ``'path'``: regression model
        * ``'coef'``: regression estimates
        * ``'se'``: standard error
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval
        * ``'pval'``: two-sided p-values
        * ``'sig'``: statistical significance

    See also
    --------
    linear_regression, logistic_regression

    Notes
    -----
    Mediation analysis [1]_ is a *"statistical procedure to test
    whether the effect of an independent variable X on a dependent variable
    Y (i.e., X → Y) is at least partly explained by a chain of effects of the
    independent variable on an intervening mediator variable M and of the
    intervening variable on the dependent variable (i.e., X → M → Y)"* [2]_.

    The **indirect effect** (also referred to as average causal mediation
    effect or ACME) of X on Y through mediator M quantifies the estimated
    difference in Y resulting from a one-unit change in X through a sequence of
    causal steps in which X affects M, which in turn affects Y.
    It is considered significant if the specified confidence interval does not
    include 0. The path 'X --> Y' is the sum of both the indirect and direct
    effect. It is sometimes referred to as total effect.

    A linear regression is used if the mediator variable is continuous and a
    logistic regression if the mediator variable is dichotomous (binary).
    Multiple parallel mediators are also supported.

    This function will only work well if the outcome variable is continuous.
    It does not support binary or ordinal outcome variables. For more
    advanced mediation models, please refer to the
    `lavaan <http://lavaan.ugent.be/tutorial/mediation.html>`_ or  `mediation
    <https://cran.r-project.org/web/packages/mediation/mediation.pdf>`_ R
    packages, or the `PROCESS macro
    <https://www.processmacro.org/index.html>`_ for SPSS.

    The two-sided p-value of the indirect effect is computed using the
    bootstrap distribution, as in the mediation R package. However, the p-value
    should be interpreted with caution since it is not constructed
    conditioned on a true null hypothesis [3]_ and varies depending on the
    number of bootstrap samples and the random seed.

    Note that rows with missing values are automatically removed.

    Results have been tested against the R mediation package and this tutorial
    https://data.library.virginia.edu/introduction-to-mediation-analysis/

    References
    ----------
    .. [1] Baron, R. M. & Kenny, D. A. The moderator–mediator variable
           distinction in social psychological research: Conceptual, strategic,
           and statistical considerations. J. Pers. Soc. Psychol. 51, 1173–1182
           (1986).

    .. [2] Fiedler, K., Schott, M. & Meiser, T. What mediation analysis can
           (not) do. J. Exp. Soc. Psychol. 47, 1231–1236 (2011).


    .. [3] Hayes, A. F. & Rockwood, N. J. Regression-based statistical
           mediation and moderation analysis in clinical research:
           Observations, recommendations, and implementation. Behav. Res.
           Ther. 98, 39–57 (2017).

    Code originally adapted from https://github.com/rmill040/pymediation.

    Examples
    --------
    1. Simple mediation analysis

    >>> from pingouin import mediation_analysis, read_dataset
    >>> df = read_dataset('mediation')
    >>> mediation_analysis(data=df, x='X', m='M', y='Y', alpha=0.05,
    ...                    seed=42)
           path      coef        se          pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.561015  0.094480  4.391362e-08  0.373522   0.748509  Yes
    1     Y ~ M  0.654173  0.085831  1.612674e-11  0.483844   0.824501  Yes
    2     Total  0.396126  0.111160  5.671128e-04  0.175533   0.616719  Yes
    3    Direct  0.039604  0.109648  7.187429e-01 -0.178018   0.257226   No
    4  Indirect  0.356522  0.083313  0.000000e+00  0.219818   0.537654  Yes

    2. Return the indirect bootstrapped beta coefficients

    >>> stats, dist = mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                                  return_dist=True)
    >>> print(dist.shape)
    (500,)

    3. Mediation analysis with a binary mediator variable

    >>> mediation_analysis(data=df, x='X', m='Mbin', y='Y', seed=42).round(3)
           path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0  Mbin ~ X -0.021  0.116  0.857    -0.248      0.206   No
    1  Y ~ Mbin -0.135  0.412  0.743    -0.952      0.682   No
    2     Total  0.396  0.111  0.001     0.176      0.617  Yes
    3    Direct  0.396  0.112  0.001     0.174      0.617  Yes
    4  Indirect  0.002  0.050  0.960    -0.072      0.146   No

    4. Mediation analysis with covariates

    >>> mediation_analysis(data=df, x='X', m='M', y='Y',
    ...                    covar=['Mbin', 'Ybin'], seed=42).round(3)
           path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0     M ~ X  0.559  0.097  0.000     0.367      0.752  Yes
    1     Y ~ M  0.666  0.086  0.000     0.495      0.837  Yes
    2     Total  0.420  0.113  0.000     0.196      0.645  Yes
    3    Direct  0.064  0.110  0.561    -0.155      0.284   No
    4  Indirect  0.356  0.086  0.000     0.209      0.553  Yes

    5. Mediation analysis with multiple parallel mediators

    >>> mediation_analysis(data=df, x='X', m=['M', 'Mbin'], y='Y',
    ...                    seed=42).round(3)
                path   coef     se   pval  CI[2.5%]  CI[97.5%]  sig
    0          M ~ X  0.561  0.094  0.000     0.374      0.749  Yes
    1       Mbin ~ X -0.005  0.029  0.859    -0.063      0.052   No
    2          Y ~ M  0.654  0.086  0.000     0.482      0.825  Yes
    3       Y ~ Mbin -0.064  0.328  0.846    -0.715      0.587   No
    4          Total  0.396  0.111  0.001     0.176      0.617  Yes
    5         Direct  0.040  0.110  0.721    -0.179      0.258   No
    6     Indirect M  0.356  0.085  0.000     0.215      0.538  Yes
    7  Indirect Mbin  0.000  0.010  0.952    -0.017      0.025   No
    """
    # Sanity check
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert isinstance(m, (list, str)), 'Mediator(s) must be a list or string.'
    assert isinstance(covar, (type(None), str, list))
    if isinstance(m, str):
        m = [m]
    n_mediator = len(m)
    assert isinstance(data, pd.DataFrame), 'Data must be a DataFrame.'
    # Check for duplicates
    assert n_mediator == len(set(m)), 'Cannot have duplicate mediators.'
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(covar, list):
        assert len(covar) == len(set(covar)), 'Cannot have duplicate covariates.'
        assert set(m).isdisjoint(covar), 'Mediator cannot be in covar.'
    # Check that columns are in dataframe
    columns = _fl([x, m, y, covar])
    keys = data.columns
    assert all([c in keys for c in columns]), 'Column(s) are not in DataFrame.'
    # Check that columns are numeric
    err_msg = "Columns must be numeric or boolean."
    assert all([data[c].dtype.kind in 'bfiu' for c in columns]), err_msg

    # Drop rows with NAN Values
    data = data[columns].dropna()
    n = data.shape[0]
    assert n > 5, 'DataFrame must have more than 5 samples (rows).'

    # Check if mediator is binary
    mtype = 'logistic' if all(data[m].nunique() == 2) else 'linear'

    # Name of CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Compute regressions
    cols = ['names', 'coef', 'se', 'pval', ll_name, ul_name]

    # For speed, we pass np.array instead of pandas DataFrame
    X_val = data[_fl([x, covar])].to_numpy()  # X + covar as predictors
    XM_val = data[_fl([x, m, covar])].to_numpy()  # X + M + covar as predictors
    M_val = data[m].to_numpy()  # M as target (no covariates)
    y_val = data[y].to_numpy()  # y as target (no covariates)

    # For max precision, make sure rounding is disabled
    old_options = options.copy()
    options['round'] = None

    # M(j) ~ X + covar
    sxm = {}
    for idx, j in enumerate(m):
        if mtype == 'linear':
            sxm[j] = linear_regression(X_val, M_val[:, idx],
                                       alpha=alpha).loc[[1], cols]
        else:
            sxm[j] = logistic_regression(X_val, M_val[:, idx],
                                         alpha=alpha).loc[[1], cols]
        sxm[j].at[1, 'names'] = '%s ~ X' % j
    sxm = pd.concat(sxm, ignore_index=True)

    # Y ~ M + covar
    smy = linear_regression(data[_fl([m, covar])], y_val,
                            alpha=alpha).loc[1:n_mediator, cols]

    # Average Total Effects (Y ~ X + covar)
    sxy = linear_regression(X_val, y_val, alpha=alpha).loc[[1], cols]

    # Average Direct Effects (Y ~ X + M + covar)
    direct = linear_regression(XM_val, y_val, alpha=alpha).loc[[1], cols]

    # Rename paths
    smy['names'] = smy['names'].apply(lambda x: 'Y ~ %s' % x)
    direct.at[1, 'names'] = 'Direct'
    sxy.at[1, 'names'] = 'Total'

    # Concatenate and create sig column
    stats = pd.concat((sxm, smy, sxy, direct), ignore_index=True)
    stats['sig'] = np.where(stats['pval'] < alpha, 'Yes', 'No')

    # Bootstrap confidence intervals
    rng = np.random.RandomState(seed)
    idx = rng.choice(np.arange(n), replace=True, size=(n_boot, n))
    ab_estimates = np.zeros(shape=(n_boot, n_mediator))
    for i in range(n_boot):
        ab_estimates[i, :] = _point_estimate(X_val, XM_val, M_val, y_val,
                                             idx[i, :], n_mediator, mtype)

    ab = _point_estimate(X_val, XM_val, M_val, y_val, np.arange(n),
                         n_mediator, mtype)
    indirect = {'names': m, 'coef': ab, 'se': ab_estimates.std(ddof=1, axis=0),
                'pval': [], ll_name: [], ul_name: [], 'sig': []}

    for j in range(n_mediator):
        ci_j = _bca(ab_estimates[:, j], indirect['coef'][j],
                    alpha=alpha, n_boot=n_boot)
        indirect[ll_name].append(min(ci_j))
        indirect[ul_name].append(max(ci_j))
        # Bootstrapped p-value of indirect effect
        # Note that this is less accurate than a permutation test because the
        # bootstrap distribution is not conditioned on a true null hypothesis.
        # For more details see Hayes and Rockwood 2017
        indirect['pval'].append(_pval_from_bootci(ab_estimates[:, j],
                                indirect['coef'][j]))
        indirect['sig'].append('Yes' if indirect['pval'][j] < alpha else 'No')

    # Create output dataframe
    indirect = pd.DataFrame.from_dict(indirect)
    if n_mediator == 1:
        indirect['names'] = 'Indirect'
    else:
        indirect['names'] = indirect['names'].apply(lambda x:
                                                    'Indirect %s' % x)
    stats = stats.append(indirect, ignore_index=True)
    stats = stats.rename(columns={'names': 'path'})

    # Restore options
    options.update(old_options)

    if return_dist:
        return _postprocess_dataframe(stats), np.squeeze(ab_estimates)
    else:
        return _postprocess_dataframe(stats)
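
For a single continuous mediator with no covariates, the decomposition
described in the Notes is exact for OLS fits: the total effect equals the
direct effect plus the indirect effect (c = c' + a*b), as the 'Total',
'Direct' and 'Indirect' rows of the first docstring example illustrate.
A minimal sketch that checks this identity on the same dataset:

import numpy as np
import pingouin as pg

df = pg.read_dataset('mediation')
res = pg.mediation_analysis(data=df, x='X', m='M', y='Y', seed=42)
coefs = res.set_index('path')['coef']

# With one continuous mediator and OLS, c = c' + a*b, so the total effect
# is the sum of the direct and indirect effects
assert np.isclose(coefs['Total'], coefs['Direct'] + coefs['Indirect'])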
Example #12
0
def logistic_regression(X, y, coef_only=False, alpha=0.05,
                        as_dataframe=True, remove_na=False, **kwargs):
    """(Multiple) Binary logistic regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
        ``y`` must be binary, i.e. only contains 0 or 1. Multinomial logistic
        regression is not supported.
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    **kwargs : optional
        Optional arguments passed to
        :py:class:`sklearn.linear_model.LogisticRegression` (see Notes).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Logistic regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients (log-odds)
        * ``'se'``: standard error
        * ``'z'``: z-scores
        * ``'pval'``: two-tailed p-values
        * ``'CI[2.5%]'``: lower confidence interval
        * ``'CI[97.5%]'``: upper confidence interval

    See also
    --------
    linear_regression

    Notes
    -----
    .. caution:: This function is a wrapper around the
        :py:class:`sklearn.linear_model.LogisticRegression` class. However,
        Pingouin internally disables the L2 regularization and changes the
        default solver in order to get results that are similar to R and
        statsmodels.

    The logistic regression assumes that the log-odds (the logarithm of the
    odds) for the value labeled "1" in the response variable is a linear
    combination of the predictor variables. The log-odds are given by the
    `logit <https://en.wikipedia.org/wiki/Logit>`_ function,
    which maps a probability :math:`p` of the response variable being "1"
    from :math:`(0, 1)` to :math:`(-\\infty, +\\infty)`.

    .. math:: \\text{logit}(p) = \\ln \\frac{p}{1 - p} = \\beta_0 + \\beta X

    The odds of the response variable being "1" can be obtained by
    exponentiating the log-odds:

    .. math:: \\frac{p}{1 - p} = e^{\\beta_0 + \\beta X}

    and the probability of the response variable being "1" is given by the
    `logistic function <https://en.wikipedia.org/wiki/Logistic_function>`_:

    .. math:: p = \\frac{1}{1 + e^{-(\\beta_0 + \\beta X)}}

    The first coefficient is always the constant term (intercept) of
    the model. Pingouin will automatically add the intercept
    to your predictor(s) matrix, therefore, :math:`X` should not include a
    constant term. Pingouin will remove any constant term (e.g. a column with
    only one unique value) or duplicate columns from :math:`X`.

    The calculation of the p-values and confidence interval is adapted from
    `code by Rob Speare
    <https://gist.github.com/rspeare/77061e6e317896be29c6de9a85db301d>`_.
    Results have been compared against statsmodels, R, and JASP.

    Examples
    --------
    1. Simple binary logistic regression.

    In this first example, we'll use the
    `penguins dataset <https://github.com/allisonhorst/palmerpenguins>`_
    to see how well we can predict the sex of penguins based on their
    body mass.

    >>> import numpy as np
    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pg.read_dataset('penguins')
    >>> # Let's first convert the target variable from string to boolean:
    >>> df['male'] = (df['sex'] == 'male').astype(int)  # male: 1, female: 0
    >>> # Since there are missing values in our outcome variable, we need to
    >>> # set `remove_na=True` otherwise regression will fail.
    >>> lom = pg.logistic_regression(df['body_mass_g'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
             names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0    Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_g  0.00  0.00  7.24   0.0      0.00       0.00

    Body mass is a significant predictor of sex (p<0.001). Here, it
    could be useful to rescale our predictor variable from *g* to *kg*
    (e.g divide by 1000) in order to get more intuitive coefficients and
    confidence intervals:

    >>> df['body_mass_kg'] = df['body_mass_g'] / 1000
    >>> lom = pg.logistic_regression(df['body_mass_kg'], df['male'],
    ...                              remove_na=True)
    >>> lom.round(2)
              names  coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0     Intercept -5.16  0.71 -7.24   0.0     -6.56      -3.77
    1  body_mass_kg  1.23  0.17  7.24   0.0      0.89       1.56

    2. Multiple binary logistic regression

    We'll now add the species as a categorical predictor in our model. To do
    so, we first need to dummy-code our categorical variable, dropping the
    first level of our categorical variable (species = Adelie) which will be
    used as the reference level:

    >>> df = pd.get_dummies(df, columns=['species'], drop_first=True)
    >>> X = df[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
    >>> y = df['male']
    >>> lom = pg.logistic_regression(X, y, remove_na=True)
    >>> lom.round(2)
                   names   coef    se     z  pval  CI[2.5%]  CI[97.5%]
    0          Intercept -26.24  2.84 -9.24  0.00    -31.81     -20.67
    1       body_mass_kg   7.10  0.77  9.23  0.00      5.59       8.61
    2  species_Chinstrap  -0.13  0.42 -0.31  0.75     -0.96       0.69
    3     species_Gentoo  -9.72  1.12 -8.65  0.00    -11.92      -7.52

    3. Using NumPy arrays and returning only the coefficients

    >>> pg.logistic_regression(X.to_numpy(), y.to_numpy(), coef_only=True,
    ...                        remove_na=True)
    array([-26.23906892,   7.09826571,  -0.13180626,  -9.71718529])

    4. Passing custom parameters to sklearn

    >>> lom = pg.logistic_regression(X, y, solver='sag', max_iter=10000,
    ...                           random_state=42, remove_na=True)
    >>> print(lom['coef'].to_numpy())
    [-25.98248153   7.02881472  -0.13119779  -9.62247569]

    **How to interpret the log-odds coefficients?**

    We'll use the `Wikipedia example
    <https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study>`_
    of the probability of passing an exam
    versus the hours of study:

    *A group of 20 students spends between 0 and 6 hours studying for an
    exam. How does the number of hours spent studying affect the
    probability of the student passing the exam?*

    >>> # First, let's create the dataframe
    >>> Hours = [0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
    ...          2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50]
    >>> Pass = [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]
    >>> df = pd.DataFrame({'HoursStudy': Hours, 'PassExam': Pass})
    >>> # And then run the logistic regression
    >>> lr = pg.logistic_regression(df['HoursStudy'], df['PassExam']).round(3)
    >>> lr
            names   coef     se      z   pval  CI[2.5%]  CI[97.5%]
    0   Intercept -4.078  1.761 -2.316  0.021    -7.529     -0.626
    1  HoursStudy  1.505  0.629  2.393  0.017     0.272      2.737

    The ``Intercept`` coefficient (-4.078) is the log-odds of ``PassExam=1``
    when ``HoursStudy=0``. The odds ratio can be obtained by exponentiating
    the log-odds:

    >>> np.exp(-4.078)
    0.016941314421496552

    i.e. :math:`0.017:1`. Conversely the odds of failing the exam are
    :math:`(1/0.017) \\approx 59:1`.

    The probability can then be obtained with the following equation

    .. math:: p = \\frac{1}{1 + e^{-(-4.078 + 0 * 1.505)}}

    >>> 1 / (1 + np.exp(-(-4.078)))
    0.016659087580814722

    The ``HoursStudy`` coefficient (1.505) means that for each additional hour
    of study, the log-odds of passing the exam increase by 1.505, and the odds
    are multiplied by :math:`e^{1.505} \\approx 4.50`.

    For example, a student who studies 2 hours has a probability of passing
    the exam of 25%:

    >>> 1 / (1 + np.exp(-(-4.078 + 2 * 1.505)))
    0.2557836148964987

    The table below shows the probability of passing the exam for several
    values of ``HoursStudy``:

    +----------------+----------+----------------+------------------+
    | Hours of Study | Log-odds | Odds           | Probability      |
    +================+==========+================+==================+
    | 0              | −4.08    | 0.017 ≈ 1:59   | 0.017            |
    +----------------+----------+----------------+------------------+
    | 1              | −2.57    | 0.076 ≈ 1:13   | 0.07             |
    +----------------+----------+----------------+------------------+
    | 2              | −1.07    | 0.34 ≈ 1:3     | 0.26             |
    +----------------+----------+----------------+------------------+
    | 3              | 0.44     | 1.55           | 0.61             |
    +----------------+----------+----------------+------------------+
    | 4              | 1.94     | 6.96           | 0.87             |
    +----------------+----------+----------------+------------------+
    | 5              | 3.45     | 31.4           | 0.97             |
    +----------------+----------+----------------+------------------+
    | 6              | 4.96     | 141.4          | 0.99             |
    +----------------+----------+----------------+------------------+
    """
    # Check that sklearn is installed
    from pingouin.utils import _is_sklearn_installed
    _is_sklearn_installed(raise_error=True)
    from sklearn.linear_model import LogisticRegression

    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1, 'alpha must be between 0 and 1.'

    # Add axis if only one-dimensional array
    if X.ndim == 1:
        X = X[..., np.newaxis]

    # Check for NaN /  Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    # Check that y is binary
    if np.unique(y).size != 2:
        raise ValueError('Dependent variable must be binary.')

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    # We also want to make sure that there is no column
    # with only one unique value, otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique):
        X = np.delete(X, idx_unique, 1)
        names = np.delete(names, idx_unique).tolist()

    # Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate).tolist()

    # Initialize and fit
    if 'solver' not in kwargs:
        # https://stats.stackexchange.com/a/204324/253579
        # Updated in Pingouin > 0.3.6 to be consistent with R
        kwargs['solver'] = 'newton-cg'
    if 'penalty' not in kwargs:
        kwargs['penalty'] = 'none'
    lom = LogisticRegression(**kwargs)
    lom.fit(X, y)

    if lom.get_params()['fit_intercept']:
        names.insert(0, "Intercept")
        X_design = np.column_stack((np.ones(X.shape[0]), X))
        coef = np.append(lom.intercept_, lom.coef_)
    else:
        coef = lom.coef_
        X_design = X

    if coef_only:
        return coef

    # Fisher Information Matrix
    n, p = X_design.shape
    denom = (2 * (1 + np.cosh(lom.decision_function(X))))
    denom = np.tile(denom, (p, 1)).T
    fim = (X_design / denom).T @ X_design
    crao = np.linalg.pinv(fim)

    # Standard error and Z-scores
    se = np.sqrt(np.diag(crao))
    z_scores = coef / se

    # Two-tailed p-values
    pval = 2 * norm.sf(np.fabs(z_scores))

    # Wald Confidence intervals
    # In R: this is equivalent to confint.default(model)
    # Note that confint(model) will however return the profile CI
    crit = norm.ppf(1 - alpha / 2)
    ll = coef - crit * se
    ul = coef + crit * se

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': se, 'z': z_scores,
             'pval': pval, ll_name: ll, ul_name: ul}
    if as_dataframe:
        return _postprocess_dataframe(pd.DataFrame(stats))
    else:
        return stats
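
Since the L2 penalty is disabled, the coefficients returned by this function
are plain maximum-likelihood estimates and should agree with an unpenalized
logit fit from another package, up to the solvers' convergence tolerance.
A small cross-check sketch re-using the hours-of-study data from the
docstring; it assumes statsmodels is installed (not a Pingouin dependency):

import numpy as np
import pingouin as pg
import statsmodels.api as sm  # assumed available; not a Pingouin dependency

hours = np.array([0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,
                  2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50])
passed = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])

# Pingouin (unpenalized newton-cg) vs. statsmodels (Newton-Raphson MLE)
coef_pg = pg.logistic_regression(hours, passed, coef_only=True)
coef_sm = sm.Logit(passed, sm.add_constant(hours)).fit(disp=0).params

# Agreement up to the solvers' convergence tolerance
assert np.allclose(coef_pg, coef_sm, atol=1e-3)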
Example #13
0
def linear_regression(X, y, add_intercept=True, weights=None, coef_only=False,
                      alpha=0.05, as_dataframe=True, remove_na=False,
                      relimp=False):
    """(Multiple) Linear regression.

    Parameters
    ----------
    X : array_like
        Predictor(s), of shape *(n_samples, n_features)* or *(n_samples)*.
    y : array_like
        Dependent variable, of shape *(n_samples)*.
    add_intercept : bool
        If False, assume that the data are already centered. If True, add a
        constant term to the model. In this case, the first value in the
        output dict is the intercept of the model.

        .. note:: It is generally recommended to include a constant term
            (intercept) in the model to limit the bias and force the residual
            mean to equal zero. The intercept coefficient and p-values
            are however rarely meaningful.
    weights : array_like
        An optional vector of sample weights to be used in the fitting
        process, of shape *(n_samples)*. Missing or negative weights are not
        allowed. If not null, a weighted least squares is calculated.

        .. versionadded:: 0.3.5
    coef_only : bool
        If True, return only the regression coefficients.
    alpha : float
        Alpha value used for the confidence intervals.
        :math:`\\text{CI} = [\\alpha / 2 ; 1 - \\alpha / 2]`
    as_dataframe : bool
        If True, returns a pandas DataFrame. If False, returns a dictionary.
    remove_na : bool
        If True, apply a listwise deletion of missing values (i.e. the entire
        row is removed). Default is False, which will raise an error if missing
        values are present in either the predictor(s) or dependent
        variable.
    relimp : bool
        If True, returns the relative importance (= contribution) of
        predictors. This is irrelevant when the predictors are uncorrelated:
        the total :math:`R^2` of the model is simply the sum of each univariate
        regression :math:`R^2`-values. However, this does not apply when
        predictors are correlated. Instead, the total :math:`R^2` of the model
        is partitioned by averaging over all combinations of predictors,
        as done in the `relaimpo
        <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_
        R package (``calc.relimp(type="lmg")``).

        .. warning:: The computation time roughly doubles for each
            additional predictor and therefore this can be extremely slow for
            models with more than 12-15 predictors.

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame` or dict
        Linear regression summary:

        * ``'names'``: name of variable(s) in the model (e.g. x1, x2...)
        * ``'coef'``: regression coefficients
        * ``'se'``: standard errors
        * ``'T'``: T-values
        * ``'pval'``: p-values
        * ``'r2'``: coefficient of determination (:math:`R^2`)
        * ``'adj_r2'``: adjusted :math:`R^2`
        * ``'CI[2.5%]'``: lower confidence intervals
        * ``'CI[97.5%]'``: upper confidence intervals
        * ``'relimp'``: relative contribution of each predictor to the final\
                        :math:`R^2` (only if ``relimp=True``).
        * ``'relimp_perc'``: percent relative contribution

        In addition, the output dataframe comes with hidden attributes such as
        the residuals, and degrees of freedom of the model and residuals, which
        can be accessed as follows, respectively:

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm.residuals_, lm.df_model_, lm.df_resid_ # doctest: +SKIP

        Note that to follow scikit-learn convention, these hidden attributes end
        with an "_". When ``as_dataframe=False`` however, these attributes
        are no longer hidden and can be accessed as any other keys in the
        output dictionary.

        >>> lm = pg.linear_regression() # doctest: +SKIP
        >>> lm['residuals'], lm['df_model'], lm['df_resid'] # doctest: +SKIP

        When ``as_dataframe=False`` the dictionary also contains the
        processed ``X`` and ``y`` arrays (i.e., with NaNs removed if
        ``remove_na=True``) and the model's predicted values ``pred``.

        >>> lm['X'], lm['y'], lm['pred'] # doctest: +SKIP

        For a weighted least squares fit, the weighted ``Xw`` and ``yw``
        arrays are included in the dictionary.

        >>> lm['Xw'], lm['yw'] # doctest: +SKIP

    See also
    --------
    logistic_regression, mediation_analysis, corr

    Notes
    -----
    The :math:`\\beta` coefficients are estimated using an ordinary least
    squares (OLS) regression, as implemented in the
    :py:func:`scipy.linalg.lstsq` function. The OLS method minimizes
    the sum of squared residuals, and leads to a closed-form expression for
    the estimated :math:`\\beta`:

    .. math:: \\hat{\\beta} = (X^TX)^{-1} X^Ty

    It is generally recommended to include a constant term (intercept) in the
    model to limit the bias and force the residual mean to equal zero.
    Note that the intercept coefficient and p-values are, however, rarely
    meaningful.

    The standard error of the estimates is a measure of the accuracy of the
    prediction defined as:

    .. math:: \\sigma = \\sqrt{\\text{MSE} \\cdot (X^TX)^{-1}}

    where :math:`\\text{MSE}` is the mean squared error,

    .. math::

        \\text{MSE} = \\frac{SS_{\\text{resid}}}{n - p - 1}
         = \\frac{\\sum{(\\text{true} - \\text{pred})^2}}{n - p - 1}

    :math:`p` is the total number of predictor variables in the model
    (excluding the intercept) and :math:`n` is the sample size.

    Using the :math:`\\beta` coefficients and the standard errors,
    the T-values can be obtained:

    .. math:: T = \\frac{\\beta}{\\sigma}

    and the p-values approximated using a T-distribution with
    :math:`n - p - 1` degrees of freedom.

    The coefficient of determination (:math:`R^2`) is defined as:

    .. math:: R^2 = 1 - (\\frac{SS_{\\text{resid}}}{SS_{\\text{total}}})

    The adjusted :math:`R^2` is defined as:

    .. math:: \\overline{R}^2 = 1 - (1 - R^2) \\frac{n - 1}{n - p - 1}

    The relative importance (``relimp``) column is a partitioning of the
    total :math:`R^2` of the model into individual :math:`R^2` contribution.
    This is calculated by taking the average over average contributions in
    models of different sizes. For more details, please refer to
    `Groemping et al. 2006 <http://dx.doi.org/10.18637/jss.v017.i01>`_
    and the R package `relaimpo
    <https://cran.r-project.org/web/packages/relaimpo/relaimpo.pdf>`_.

    Note that Pingouin will automatically remove any duplicate columns
    from :math:`X`, as well as any column with only one unique value
    (constant), excluding the intercept.

    Results have been compared against sklearn, R, statsmodels and JASP.

    Examples
    --------
    1. Simple linear regression using columns of a pandas dataframe

    In this first example, we'll use the tips dataset to see how well we
    can predict the waiter's tip (in dollars) based on the total bill (also
    in dollars).

    >>> import numpy as np
    >>> import pingouin as pg
    >>> df = pg.read_dataset('tips')
    >>> # Let's predict the tip ($) based on the total bill (also in $)
    >>> lm = pg.linear_regression(df['total_bill'], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.92  0.16   5.76   0.0  0.46    0.45      0.61       1.23
    1  total_bill  0.11  0.01  14.26   0.0  0.46    0.45      0.09       0.12

    It comes as no surprise that total bill is indeed a significant predictor
    of the waiter's tip (T=14.26, p<0.05). The :math:`R^2` of the model is 0.46
    and the adjusted :math:`R^2` is 0.45, which means that our model roughly
    explains ~45% of the total variance in the tip amount.

    2. Multiple linear regression

    We can also have more than one predictor and run a multiple linear
    regression. Below, we add the party size as a second predictor of tip.

    >>> # We'll add a second predictor: the party size
    >>> lm = pg.linear_regression(df[['total_bill', 'size']], df['tip'])
    >>> lm.round(2)
            names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0   Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1  total_bill  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2        size  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    The party size is also a significant predictor of tip (T=2.26, p=0.02).
    Note that adding this new predictor however only improved the :math:`R^2`
    of our model by ~1%.

    This function also works with numpy arrays:

    >>> X = df[['total_bill', 'size']].to_numpy()
    >>> y = df['tip'].to_numpy()
    >>> pg.linear_regression(X, y).round(2)
           names  coef    se      T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  0.67  0.19   3.46  0.00  0.47    0.46      0.29       1.05
    1         x1  0.09  0.01  10.17  0.00  0.47    0.46      0.07       0.11
    2         x2  0.19  0.09   2.26  0.02  0.47    0.46      0.02       0.36

    3. Get the residuals

    >>> # For clarity, only display the first 9 values
    >>> np.round(lm.residuals_, 2)[:9]
    array([-1.62, -0.55,  0.31,  0.06, -0.11,  0.93,  0.13, -0.81, -0.49])

    Using pandas, we can show a summary of the distribution of the residuals:

    >>> import pandas as pd
    >>> pd.Series(lm.residuals_).describe().round(2)
    count    244.00
    mean      -0.00
    std        1.01
    min       -2.93
    25%       -0.55
    50%       -0.09
    75%        0.51
    max        4.04
    dtype: float64

    5. No intercept and return only the regression coefficients

    Sometimes it may be useful to remove the constant term from the regression,
    or to only return the regression coefficients without calculating the
    standard errors or p-values. The latter can potentially save you a lot of
    time if you need to calculate hundreds of regressions and only care about
    the coefficients!

    >>> pg.linear_regression(X, y, add_intercept=False, coef_only=True)
    array([0.1007119 , 0.36209717])

    6. Return a dictionary instead of a dataframe

    >>> lm_dict = pg.linear_regression(X, y, as_dataframe=False)
    >>> lm_dict.keys()
    dict_keys(['names', 'coef', 'se', 'T', 'pval', 'r2', 'adj_r2', 'CI[2.5%]',
               'CI[97.5%]', 'df_model', 'df_resid', 'residuals', 'X', 'y',
               'pred'])

    7. Remove missing values

    >>> X[4, 1] = np.nan
    >>> y[7] = np.nan
    >>> pg.linear_regression(X, y, remove_na=True, coef_only=True)
    array([0.65749955, 0.09262059, 0.19927529])

    8. Get the relative importance of predictors

    >>> lm = pg.linear_regression(X, y, remove_na=True, relimp=True)
    >>> lm[['names', 'relimp', 'relimp_perc']]
           names    relimp  relimp_perc
    0  Intercept       NaN          NaN
    1         x1  0.342503    73.045583
    2         x2  0.126386    26.954417

    The ``relimp`` column is a partitioning of the total :math:`R^2` of the
    model into individual contribution. Therefore, it sums to the :math:`R^2`
    of the full model. The ``relimp_perc`` is normalized to sum to 100%. See
    `Groemping 2006 <https://www.jstatsoft.org/article/view/v017i01>`_
    for more details.

    >>> lm[['relimp', 'relimp_perc']].sum()
    relimp           0.468889
    relimp_perc    100.000000
    dtype: float64

    9. Weighted linear regression

    >>> X = [1, 2, 3, 4, 5, 6]
    >>> y = [10, 22, 11, 13, 13, 16]
    >>> w = [1, 0.1, 1, 1, 0.5, 1]  # Array of weights. Must be >= 0.
    >>> lm = pg.linear_regression(X, y, weights=w)
    >>> lm.round(2)
           names  coef    se     T  pval    r2  adj_r2  CI[2.5%]  CI[97.5%]
    0  Intercept  9.00  2.03  4.42  0.01  0.51    0.39      3.35      14.64
    1         x1  1.04  0.50  2.06  0.11  0.51    0.39     -0.36       2.44
    """
    # Extract names if X is a Dataframe or Series
    if isinstance(X, pd.DataFrame):
        names = X.keys().tolist()
    elif isinstance(X, pd.Series):
        names = [X.name]
    else:
        names = []

    # Convert input to numpy array
    X = np.asarray(X)
    y = np.asarray(y)
    assert y.ndim == 1, 'y must be one-dimensional.'
    assert 0 < alpha < 1

    if X.ndim == 1:
        # Convert to (n_samples, n_features) shape
        X = X[..., np.newaxis]

    # Check for NaN / Inf
    if remove_na:
        X, y = rm_na(X, y[..., np.newaxis], paired=True, axis='rows')
        y = np.squeeze(y)
    y_gd = np.isfinite(y).all()
    X_gd = np.isfinite(X).all()
    assert y_gd, ("Target (y) contains NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")
    assert X_gd, ("Predictors (X) contain NaN or Inf. Please remove them "
                  "manually or use remove_na=True.")

    # Check that X and y have same length
    assert y.shape[0] == X.shape[0], 'X and y must have same number of samples'

    if not names:
        names = ['x' + str(i + 1) for i in range(X.shape[1])]

    if add_intercept:
        # Add intercept
        X = np.column_stack((np.ones(X.shape[0]), X))
        names.insert(0, "Intercept")

    # FINAL CHECKS BEFORE RUNNING LEAST SQUARES REGRESSION
    # 1. Let's remove column(s) with only zero, otherwise the regression fails
    n_nonzero = np.count_nonzero(X, axis=0)
    idx_zero = np.flatnonzero(n_nonzero == 0)  # Find columns that are only 0
    if len(idx_zero):
        X = np.delete(X, idx_zero, 1)
        names = np.delete(names, idx_zero)

    # 2. We also want to make sure that there is no more than one constant
    # column (= intercept), otherwise the regression fails
    # This is equivalent, but much faster, to pd.DataFrame(X).nunique()
    idx_unique = np.where(np.all(X == X[0, :], axis=0))[0]
    if len(idx_unique) > 1:
        # We remove all but the first "Intercept" column.
        X = np.delete(X, idx_unique[1:], 1)
        names = np.delete(names, idx_unique[1:])
    # Is there a constant in our predictor matrix? Useful for dof and R^2.
    constant = 1 if len(idx_unique) > 0 else 0

    # 3. Finally, we want to remove duplicate columns
    if X.shape[1] > 1:
        idx_duplicate = []
        for pair in itertools.combinations(range(X.shape[1]), 2):
            if np.array_equal(X[:, pair[0]], X[:, pair[1]]):
                idx_duplicate.append(pair[1])
        if len(idx_duplicate):
            X = np.delete(X, idx_duplicate, 1)
            names = np.delete(names, idx_duplicate)

    # 4. Check that we have enough samples / features
    n, p = X.shape[0], X.shape[1]
    assert n >= 3, 'At least three valid samples are required in X.'
    assert p >= 1, 'X must have at least one valid column.'

    # 5. Handle weights
    if weights is not None:
        if relimp:
            raise ValueError("relimp = True is not supported when using "
                             "weights.")
        w = np.asarray(weights)
        assert w.ndim == 1, 'weights must be a 1D array.'
        assert w.size == n, 'weights must be of shape n_samples.'
        assert not np.isnan(w).any(), 'Missing weights are not accepted.'
        assert not (w < 0).any(), 'Negative weights are not accepted.'
        # Do not count weights == 0 in dof
        # This gives similar results as R lm() but different from statsmodels
        n = np.count_nonzero(w)
        # Rescale (whitening)
        wts = np.diag(np.sqrt(w))
        Xw = wts @ X
        yw = wts @ y
    else:
        # Set all weights to one, [1, 1, 1, ...]
        w = np.ones(n)
        Xw = X
        yw = y

    # FIT (WEIGHTED) LEAST SQUARES REGRESSION USING SCIPY.LINALG.LSTSQ
    coef, ss_res, rank, _ = lstsq(Xw, yw)
    if coef_only:
        return coef

    # Degrees of freedom
    df_model = rank - constant
    df_resid = n - p

    # Calculate predicted values and (weighted) residuals
    pred = Xw @ coef
    resid = yw - pred
    # ss_res = (resid ** 2).sum()

    # Calculate total (weighted) sums of squares and R^2
    ss_tot = yw @ yw
    ss_wtot = np.sum(w * (y - np.average(y, weights=w))**2)
    if constant:
        r2 = 1 - ss_res / ss_wtot
    else:
        r2 = 1 - ss_res / ss_tot
    adj_r2 = 1 - (1 - r2) * (n - constant) / df_resid

    # Compute mean squared error, variance and SE
    mse = ss_res / df_resid
    beta_var = mse * (np.linalg.pinv(Xw.T @ Xw).diagonal())
    beta_se = np.sqrt(beta_var)

    # Compute T and p-values
    T = coef / beta_se
    pval = 2 * t.sf(np.fabs(T), df_resid)

    # Compute confidence intervals
    crit = t.ppf(1 - alpha / 2, df_resid)
    marg_error = crit * beta_se
    ll = coef - marg_error
    ul = coef + marg_error

    # Rename CI
    ll_name = 'CI[%.1f%%]' % (100 * alpha / 2)
    ul_name = 'CI[%.1f%%]' % (100 * (1 - alpha / 2))

    # Create dict
    stats = {'names': names, 'coef': coef, 'se': beta_se, 'T': T,
             'pval': pval, 'r2': r2, 'adj_r2': adj_r2, ll_name: ll,
             ul_name: ul}

    # Relative importance
    if relimp:
        data = pd.concat([pd.DataFrame(y, columns=['y']),
                          pd.DataFrame(X, columns=names)], sort=False, axis=1)
        if 'Intercept' in names:
            # Intercept is the first column
            reli = _relimp(data.drop(columns=['Intercept']).cov())
            reli['names'] = ['Intercept'] + reli['names']
            reli['relimp'] = np.insert(reli['relimp'], 0, np.nan)
            reli['relimp_perc'] = np.insert(reli['relimp_perc'], 0, np.nan)
        else:
            reli = _relimp(data.cov())
        stats.update(reli)

    if as_dataframe:
        stats = _postprocess_dataframe(pd.DataFrame(stats))
        stats.df_model_ = df_model
        stats.df_resid_ = df_resid
        stats.residuals_ = 0  # Trick to avoid Pandas warning
        stats.residuals_ = resid  # Residuals is a hidden attribute
    else:
        stats['df_model'] = df_model
        stats['df_resid'] = df_resid
        stats['residuals'] = resid
        stats['X'] = X
        stats['y'] = y
        stats['pred'] = pred
        if weights is not None:
            stats['yw'] = yw
            stats['Xw'] = Xw
    return stats
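
The closed-form OLS expression from the Notes can be checked directly
against the coefficients returned by this function. A minimal sketch using
the tips dataset from the examples (solving the normal equations explicitly;
scipy.linalg.lstsq returns the same solution up to floating-point precision):

import numpy as np
import pingouin as pg

df = pg.read_dataset('tips')
X = df[['total_bill', 'size']].to_numpy()
y = df['tip'].to_numpy()

# beta_hat = (X^T X)^-1 X^T y, with an explicit intercept column
Xd = np.column_stack((np.ones(len(X)), X))
beta_hat = np.linalg.solve(Xd.T @ Xd, Xd.T @ y)

# Should match the coefficients returned by linear_regression
assert np.allclose(beta_hat, pg.linear_regression(X, y, coef_only=True))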
Example #14
0
def intraclass_corr(data=None,
                    targets=None,
                    raters=None,
                    ratings=None,
                    nan_policy='raise'):
    """Intraclass correlation.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Long-format dataframe. Data must be fully balanced.
    targets : string
        Name of column in ``data`` containing the targets.
    raters : string
        Name of column in ``data`` containing the raters.
    ratings : string
        Name of column in ``data`` containing the ratings.
    nan_policy : str
        Defines how to handle missing values (NaN) in the input.
        `'raise'` (default) throws an error, `'omit'` performs the calculations
        after deleting target(s) with one or more missing values (= listwise
        deletion).

        .. versionadded:: 0.3.0

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`
        Output dataframe:

        * ``'Type'``: ICC type
        * ``'Description'``: description of the ICC
        * ``'ICC'``: intraclass correlation
        * ``'F'``: F statistic
        * ``'df1'``: numerator degree of freedom
        * ``'df2'``: denominator degree of freedom
        * ``'pval'``: p-value
        * ``'CI95%'``: 95% confidence intervals around the ICC

    Notes
    -----
    The intraclass correlation (ICC, [1]_) assesses the reliability of ratings
    by comparing the variability of different ratings of the same subject to
    the total variation across all ratings and all subjects.

    Shrout and Fleiss (1979) [2]_ describe six cases of reliability of ratings
    done by :math:`k` raters on :math:`n` targets. Pingouin returns all six
    cases with corresponding F and p-values, as well as 95% confidence
    intervals.

    From the documentation of the ICC function in the `psych
    <https://cran.r-project.org/web/packages/psych/psych.pdf>`_ R package:

    - **ICC1**: Each target is rated by a different rater and the raters are
      selected at random. This is a one-way ANOVA fixed effects model.

    - **ICC2**: A random sample of :math:`k` raters rate each target. The
      measure is one of absolute agreement in the ratings. ICC1 is sensitive
      to differences in means between raters and is a measure of absolute
      agreement.

    - **ICC3**: A fixed set of :math:`k` raters rate each target. There is no
      generalization to a larger population of raters. ICC2 and ICC3 remove
      mean differences between raters, but are sensitive to interactions.
      The difference between ICC2 and ICC3 is whether raters are seen as fixed
      or random effects.

    Then, for each of these cases, the reliability can either be estimated for
    a single rating or for the average of :math:`k` ratings. The 1 rating case
    is equivalent to the average intercorrelation, while the :math:`k` rating
    case is equivalent to the Spearman Brown adjusted reliability.
    **ICC1k**, **ICC2k**, **ICC3k** reflect the means of :math:`k` raters.

    This function has been tested against the ICC function of the R psych
    package. Note however that, contrary to the R implementation, the current
    implementation does not use a linear mixed-effects model but a regular
    ANOVA, which means that it only works with complete-case data (no missing
    values).

    References
    ----------
    .. [1] http://www.real-statistics.com/reliability/intraclass-correlation/

    .. [2] Shrout, P. E., & Fleiss, J. L. (1979). Intraclass correlations:
           uses in assessing rater reliability. Psychological bulletin, 86(2),
           420.

    Examples
    --------
    ICCs of wine quality assessed by 4 judges.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('icc')
    >>> icc = pg.intraclass_corr(data=data, targets='Wine', raters='Judge',
    ...                          ratings='Scores').round(3)
    >>> icc.set_index("Type")
                       Description    ICC       F  df1  df2  pval         CI95%
    Type
    ICC1    Single raters absolute  0.728  11.680    7   24   0.0  [0.43, 0.93]
    ICC2      Single random raters  0.728  11.787    7   21   0.0  [0.43, 0.93]
    ICC3       Single fixed raters  0.729  11.787    7   21   0.0  [0.43, 0.93]
    ICC1k  Average raters absolute  0.914  11.680    7   24   0.0  [0.75, 0.98]
    ICC2k    Average random raters  0.914  11.787    7   21   0.0  [0.75, 0.98]
    ICC3k     Average fixed raters  0.915  11.787    7   21   0.0  [0.75, 0.98]
    """
    from pingouin import anova

    # Safety check
    assert isinstance(data, pd.DataFrame), 'data must be a dataframe.'
    assert all([v is not None for v in [targets, raters, ratings]])
    assert all([v in data.columns for v in [targets, raters, ratings]])
    assert nan_policy in ['omit', 'raise']

    # Convert data to wide-format
    data = data.pivot_table(index=targets, columns=raters, values=ratings)

    # Listwise deletion of missing values
    nan_present = data.isna().any().any()
    if nan_present:
        if nan_policy == 'omit':
            data = data.dropna(axis=0, how='any')
        else:
            raise ValueError("Either missing values are present in data or "
                             "data are unbalanced. Please remove them "
                             "manually or use nan_policy='omit'.")

    # Back to long-format
    # data_wide = data.copy()  # Optional, for PCA
    data = data.reset_index().melt(id_vars=targets, value_name=ratings)

    # Check that ratings is a numeric variable
    assert data[ratings].dtype.kind in 'bfiu', 'Ratings must be numeric.'
    # Check that data are fully balanced
    # This behavior is ensured by the long-to-wide-to-long transformation
    # Unbalanced data will result in rows with missing values.
    # assert data.groupby(raters)[ratings].count().nunique() == 1

    # Extract sizes
    k = data[raters].nunique()
    n = data[targets].nunique()

    # Two-way ANOVA
    with np.errstate(invalid='ignore'):
        # For max precision, make sure rounding is disabled
        old_options = options.copy()
        options['round'] = None
        aov = anova(data=data,
                    dv=ratings,
                    between=[targets, raters],
                    ss_type=2)
        options.update(old_options)  # restore options

    # Extract mean squares
    msb = aov.at[0, 'MS']
    msw = (aov.at[1, 'SS'] + aov.at[2, 'SS']) / (aov.at[1, 'DF'] +
                                                 aov.at[2, 'DF'])
    msj = aov.at[1, 'MS']
    mse = aov.at[2, 'MS']

    # Calculate ICCs
    icc1 = (msb - msw) / (msb + (k - 1) * msw)
    icc2 = (msb - mse) / (msb + (k - 1) * mse + k * (msj - mse) / n)
    icc3 = (msb - mse) / (msb + (k - 1) * mse)
    icc1k = (msb - msw) / msb
    icc2k = (msb - mse) / (msb + (msj - mse) / n)
    icc3k = (msb - mse) / msb

    # Calculate F, df, and p-values
    f1k = msb / msw
    df1 = n - 1
    df1kd = n * (k - 1)
    p1k = f.sf(f1k, df1, df1kd)

    f2k = f3k = msb / mse
    df2kd = (n - 1) * (k - 1)
    p2k = f.sf(f2k, df1, df2kd)

    # Create output dataframe
    stats = {
        'Type': ['ICC1', 'ICC2', 'ICC3', 'ICC1k', 'ICC2k', 'ICC3k'],
        'Description': [
            'Single raters absolute', 'Single random raters',
            'Single fixed raters', 'Average raters absolute',
            'Average random raters', 'Average fixed raters'
        ],
        'ICC': [icc1, icc2, icc3, icc1k, icc2k, icc3k],
        'F': [f1k, f2k, f2k, f1k, f2k, f2k],
        'df1':
        n - 1,
        'df2': [df1kd, df2kd, df2kd, df1kd, df2kd, df2kd],
        'pval': [p1k, p2k, p2k, p1k, p2k, p2k]
    }

    stats = pd.DataFrame(stats)

    # Calculate confidence intervals
    alpha = 0.05
    # Case 1 and 3
    f1l = f1k / f.ppf(1 - alpha / 2, df1, df1kd)
    f1u = f1k * f.ppf(1 - alpha / 2, df1kd, df1)
    l1 = (f1l - 1) / (f1l + (k - 1))
    u1 = (f1u - 1) / (f1u + (k - 1))
    f3l = f3k / f.ppf(1 - alpha / 2, df1, df2kd)
    f3u = f3k * f.ppf(1 - alpha / 2, df2kd, df1)
    l3 = (f3l - 1) / (f3l + (k - 1))
    u3 = (f3u - 1) / (f3u + (k - 1))
    # Case 2
    fj = msj / mse
    vn = df2kd * ((k * icc2 * fj + n * (1 + (k - 1) * icc2) - k * icc2))**2
    vd = df1 * k**2 * icc2**2 * fj**2 + \
        (n * (1 + (k - 1) * icc2) - k * icc2)**2
    v = vn / vd
    f2u = f.ppf(1 - alpha / 2, n - 1, v)
    f2l = f.ppf(1 - alpha / 2, v, n - 1)
    l2 = n * (msb - f2u * mse) / (f2u * (k * msj +
                                         (k * n - k - n) * mse) + n * msb)
    u2 = n * (f2l * msb - mse) / (k * msj +
                                  (k * n - k - n) * mse + n * f2l * msb)

    stats['CI95%'] = [
        np.array([l1, u1]),
        np.array([l2, u2]),
        np.array([l3, u3]),
        np.array([1 - 1 / f1l, 1 - 1 / f1u]),
        np.array([l2 * k / (1 + l2 * (k - 1)), u2 * k / (1 + u2 * (k - 1))]),
        np.array([1 - 1 / f3l, 1 - 1 / f3u])
    ]

    return _postprocess_dataframe(stats)
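
The Notes state that the k-rating cases (ICC1k, ICC2k, ICC3k) are the
Spearman-Brown adjusted versions of the corresponding single-rating ICCs,
and this follows algebraically from the mean-square formulas used above.
A minimal sketch checking that identity on the wine dataset from the example:

import pingouin as pg

data = pg.read_dataset('icc')
icc = pg.intraclass_corr(data=data, targets='Wine', raters='Judge',
                         ratings='Scores').set_index('Type')
k = data['Judge'].nunique()

def spearman_brown(r, k):
    # Reliability of the average of k ratings, given single-rating reliability r
    return k * r / (1 + (k - 1) * r)

for single, average in [('ICC1', 'ICC1k'), ('ICC2', 'ICC2k'), ('ICC3', 'ICC3k')]:
    expected = spearman_brown(icc.at[single, 'ICC'], k)
    assert abs(icc.at[average, 'ICC'] - expected) < 1e-6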