Ejemplo n.º 1
0
def friedman(data=None, dv=None, within=None, subject=None, method='chisq'):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame, in long format. Categorical ``subject`` / ``within``
        columns are converted to strings on a copy, so the caller's
        DataFrame is not altered by that conversion.
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.
    method : string
        Statistical test to perform. Must be ``'chisq'`` (chi-square test) or ``'f'`` (F test).
        See notes below for explanation.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'W'``: Kendall's coefficient of concordance, corrected for ties

        If ``method='chisq'``

            * ``'ddof1'``: degrees of freedom
            * ``'Q'``: The Friedman chi-square statistic, corrected for ties
            * ``'p-unc'``: Uncorrected p-value of the chi squared test


        If ``method='f'``

            * ``'ddof1'``: degrees of freedom of the numerator
            * ``'ddof2'``: degrees of freedom of the denominator
            * ``'F'``: The Friedman F statistic, corrected for ties
            * ``'p-unc'``: Uncorrected p-value of the F test

    Raises
    ------
    ValueError
        If ``method`` is neither ``'chisq'`` nor ``'f'``.

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    NaN values are automatically removed.

    The Friedman test is equivalent to the test of significance of Kendall's
    coefficient of concordance (Kendall's W). Most commonly a Q statistic,
    which has asymptotical chi-squared distribution, is computed and used for
    testing. However, in [1]_ they showed the chi-squared test to be overly
    conservative for small numbers of samples and repeated measures. Instead
    they recommend the F test, which has the correct size and behaves like a
    permutation test, but is computationally much easier.

    References
    ----------
    .. [1] Marozzi, M. (2014). Testing for concordance between several
           criteria. Journal of Statistical Computation and Simulation,
           84(9), 1843–1850. https://doi.org/10.1080/00949655.2013.766189

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source         W  ddof1         Q     p-unc
    Friedman  Disgustingness  0.099224      1  9.227848  0.002384


    This time we will use the F test method.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject', method='f')
                      Source         W     ddof1      ddof2         F     p-unc
    Friedman  Disgustingness  0.099224  0.978495  90.021505  10.13418  0.002138

    We can see, compared to the previous example, that the p-value is slightly
    lower. This is expected, since the F test is more powerful (see Notes).
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Fail early on an unknown method: otherwise neither branch below would
    # build the output and the function would die on an unbound local.
    if method not in ('chisq', 'f'):
        raise ValueError("method must be 'chisq' or 'f'.")

    # Convert Categorical columns to string
    # This is important otherwise all the groupby will return different results
    # unless we specify .groupby(..., observed = True).
    # The conversion is done on a copy so the caller's DataFrame is untouched.
    cat_cols = [c for c in (subject, within)
                if data[c].dtype.name == 'category']
    if cat_cols:
        data = data.copy()
        for c in cat_cols:
            data[c] = data[c].astype(str)

    # Collapse to the mean. Only the dependent variable is averaged so that
    # unrelated (possibly non-numeric) columns cannot break the aggregation.
    data = data.groupby([subject, within])[dv].mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Build the (n subjects, k conditions) matrix of observations
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).to_numpy() for r in rm]).T
    n = X.shape[0]

    # Rank per subject (i.e. within each row)
    ranked = np.zeros(X.shape)
    for i in range(n):
        ranked[i] = scipy.stats.rankdata(X[i, :])

    # Sum of the squared rank totals of each condition
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Correction for ties: sum of t * (t^2 - 1) over every group of t equal
    # values within a row. Values occurring once contribute 0, so all the
    # counts returned by np.unique can be used directly.
    ties = 0
    for i in range(n):
        _, counts = np.unique(X[i], return_counts=True)
        ties += int((counts * (counts * counts - 1)).sum())

    # Compute Kendall's W corrected for ties
    W = (12 * ssbn - 3 * n * n * k * (k + 1) * (k + 1)) / (n * n * k * (k - 1) * (k + 1) - n * ties)

    if method == 'chisq':
        # Compute the Q statistic
        Q = n * (k - 1) * W

        # Approximate the p-value
        ddof1 = k - 1
        p_unc = scipy.stats.chi2.sf(Q, ddof1)

        # Create output dataframe
        stats = pd.DataFrame({'Source': within,
                              'W': W,
                              'ddof1': ddof1,
                              'Q': Q,
                              'p-unc': p_unc,
                              }, index=['Friedman'])
    else:
        # method == 'f' (validated above): compute the F statistic
        F = W * (n - 1) / (1 - W)

        # Approximate the p-value
        ddof1 = k - 1 - 2 / n
        ddof2 = (n - 1) * ddof1
        p_unc = scipy.stats.f.sf(F, ddof1, ddof2)

        # Create output dataframe
        stats = pd.DataFrame({'Source': within,
                              'W': W,
                              'ddof1': ddof1,
                              'ddof2': ddof2,
                              'F': F,
                              'p-unc': p_unc,
                              }, index=['Friedman'])

    return _postprocess_dataframe(stats)
Ejemplo n.º 2
0
def cochran(data=None, dv=None, within=None, subject=None):
    """Cochran Q test. A special case of the Friedman test when the dependent
    variable is binary.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame, in long format. Categorical ``subject`` / ``within``
        columns are converted to strings on a copy, so the caller's
        DataFrame is not altered by that conversion.
    dv : string
        Name of column containing the binary dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'dof'``: degrees of freedom
        * ``'Q'``: The Cochran Q statistic
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Cochran Q test [1]_ is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary.

    Data are expected to be in long-format. NaN are automatically removed
    from the data.

    The Q statistics is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n` where
    :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256–266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof         Q     p-unc
    cochran   Time    2  6.705882  0.034981
    """
    # Check data
    _check_dataframe(dv=dv, within=within, data=data, subject=subject,
                     effects='within')

    # Convert Categorical columns to string
    # This is important otherwise all the groupby will return different results
    # unless we specify .groupby(..., observed = True).
    # The conversion is done on a copy so the caller's DataFrame is untouched.
    cat_cols = [c for c in (subject, within)
                if data[c].dtype.name == 'category']
    if cat_cols:
        data = data.copy()
        for c in cat_cols:
            data[c] = data[c].astype(str)

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv, within=within, subject=subject,
                            data=data[[subject, within, dv]])

    # Number of conditions and degrees of freedom
    k = data[within].nunique()
    dof = k - 1

    # Per-condition and per-subject totals, each computed once
    cond_sums = data.groupby(within)[dv].sum()
    subj_sums = data.groupby(subject)[dv].sum()
    total = cond_sums.sum()

    # Q statistic and p-value
    q = (dof * (k * np.sum(cond_sums**2) - total**2)) / \
        (k * total - np.sum(subj_sums**2))
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Create output dataframe
    stats = pd.DataFrame({'Source': within,
                          'dof': dof,
                          'Q': q,
                          'p-unc': p_unc,
                          }, index=['cochran'])

    return _postprocess_dataframe(stats)
Ejemplo n.º 3
0
def wilcoxon(x, y, tail='two-sided'):
    """Wilcoxon signed-rank test, the non-parametric counterpart of the
    paired T-test.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        related (e.g repeated measures) and, therefore, have the same number
        of samples. Missing values are removed listwise before the test.
    tail : string
        One of `'two-sided'`, `'one-sided'`, `'greater'` or `'less'`.
        With ``tail='one-sided'`` the direction is picked automatically from
        the sign of the median of the paired differences: if
        ``np.median(x - y) < 0`` the test is run with ``tail='less'``,
        otherwise with ``tail='greater'``.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W-val'``: W-value
        * ``'tail'``: alternative actually used for the test
        * ``'p-val'``: p-value
        * ``'RBC'``   : matched pairs rank-biserial correlation (effect size)
        * ``'CLES'``  : common language effect size

    See also
    --------
    scipy.stats.wilcoxon, mwu

    Notes
    -----
    The Wilcoxon signed-rank test [1]_ tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero. A continuity correction is applied by default
    (see :py:func:`scipy.stats.wilcoxon` for details).

    The matched pairs rank biserial correlation [2]_ is the simple difference
    between the proportion of favorable and unfavorable evidence; in the case
    of the Wilcoxon signed-rank test, the evidence consists of rank sums
    (Kerby 2014):

    .. math:: r = f - u

    The common language effect size is the proportion of pairs where ``x`` is
    higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
    Pingouin uses a brute-force version of the formula given by Vargha and
    Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    This method has two advantages. First, the brute-force approach pairs
    each observation of ``x`` to its ``y`` counterpart, and therefore does
    not require normally distributed data. Second, the formula takes ties
    into account and therefore works with ordinal data.

    When tail is ``'less'``, the CLES is then set to :math:`1 - \\text{CL}`,
    which gives the proportion of pairs where ``x`` is *lower* than ``y``.

    References
    ----------
    .. [1] Wilcoxon, F. (1945). Individual comparisons by ranking methods.
           Biometrics bulletin, 1(6), 80-83.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
           teaching nonparametric correlation. Comprehensive Psychology,
           3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
           statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
           the “CL” Common Language Effect Size Statistics of McGraw and Wong.
           Journal of Educational and Behavioral Statistics: A Quarterly
           Publication Sponsored by the American Educational Research
           Association and the American Statistical Association, 25(2),
           101–132. https://doi.org/10.2307/1165329

    Examples
    --------
    Wilcoxon test on two related samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = [20, 22, 19, 20, 22, 18, 24, 20, 19, 24, 26, 13]
    >>> y = [38, 37, 33, 29, 14, 12, 20, 22, 17, 25, 26, 16]
    >>> pg.wilcoxon(x, y, tail='two-sided')
              W-val       tail     p-val       RBC      CLES
    Wilcoxon   20.5  two-sided  0.285765 -0.378788  0.395833

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.wilcoxon(x, y, correction=True)
    WilcoxonResult(statistic=20.5, pvalue=0.2857652190231508)

    One-sided tail: one can either manually specify the alternative hypothesis

    >>> pg.wilcoxon(x, y, tail='greater')
              W-val     tail     p-val       RBC      CLES
    Wilcoxon   20.5  greater  0.876244 -0.378788  0.395833

    >>> pg.wilcoxon(x, y, tail='less')
              W-val  tail     p-val       RBC      CLES
    Wilcoxon   20.5  less  0.142883 -0.378788  0.604167

    Or simply leave it to Pingouin, using the `'one-sided'` argument, in which
    case Pingouin will look at the sign of the median of the differences
    between ``x`` and ``y`` and adjust the tail based on that:

    >>> np.median(np.array(x) - np.array(y))
    -1.5

    The median is negative, so Pingouin will test for the alternative
    hypothesis that the median of the differences is negative (= less than 0).

    >>> pg.wilcoxon(x, y, tail='one-sided')  # Equivalent to tail = 'less'
              W-val  tail     p-val       RBC      CLES
    Wilcoxon   20.5  less  0.142883 -0.378788  0.604167
    """
    # Listwise removal of missing values on the paired arrays
    x, y = remove_na(np.asarray(x), np.asarray(y), paired=True)

    # Validate the alternative and resolve the automatic 'one-sided' choice
    assert tail in ('two-sided', 'one-sided', 'greater', 'less'), \
        'Invalid tail argument.'
    if tail == 'one-sided':
        # Direction follows the sign of the median paired difference
        tail = 'less' if np.median(x - y) < 0 else 'greater'

    # Signed-rank test with continuity correction
    w_stat, p_value = scipy.stats.wilcoxon(
        x, y, zero_method='wilcox', correction=True, alternative=tail)

    # Common language effect size (Vargha and Delaney 2000), computed over
    # every (x_i, y_j) combination with ties counted as 0.5. The value is
    # expressed for tail='greater' (identical for 'two-sided') and flipped
    # for tail='less'.
    pairwise = x[:, None] - y
    common_lang = np.where(pairwise == 0, 0.5, pairwise > 0).mean()
    if tail == 'less':
        common_lang = 1 - common_lang

    # Matched-pairs rank biserial correlation (Kerby 2014): ranks of the
    # non-zero absolute differences, split by the sign of the difference.
    nonzero = x - y
    nonzero = nonzero[nonzero != 0]
    ranks = scipy.stats.rankdata(abs(nonzero))
    rank_total = ranks.sum()
    favorable = ranks[nonzero > 0].sum()
    unfavorable = ranks[nonzero < 0].sum()
    rank_biserial = favorable / rank_total - unfavorable / rank_total

    # Assemble the single-row result table
    columns = {
        'W-val': w_stat,
        'tail': tail,
        'p-val': p_value,
        'RBC': rank_biserial,
        'CLES': common_lang,
    }
    return _postprocess_dataframe(pd.DataFrame(columns, index=['Wilcoxon']))
Ejemplo n.º 4
0
def kruskal(data=None, dv=None, between=None, detailed=False):
    """Kruskal-Wallis H-test for independent samples.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame
    dv : string
        Name of column containing the dependent variable.
    between : string
        Name of column containing the between factor.
    detailed : boolean
        Currently unused by this function; accepted only so that existing
        calls passing it keep working.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the between factor
        * ``'ddof1'``: degrees of freedom
        * ``'H'``: The Kruskal-Wallis H statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Kruskal-Wallis H-test tests the null hypothesis that the population
    median of all of the groups are equal. It is a non-parametric version of
    ANOVA. The test works on 2 or more independent samples, which may have
    different sizes.

    Due to the assumption that H has a chi square distribution, the number of
    samples in each group must not be too small. A typical rule is that each
    sample must have at least 5 measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Kruskal-Wallis H-test for independent samples.

    >>> from pingouin import kruskal, read_dataset
    >>> df = read_dataset('anova')
    >>> kruskal(data=df, dv='Pain threshold', between='Hair color')
                 Source  ddof1         H     p-unc
    Kruskal  Hair color      3  10.58863  0.014172
    """
    # Check data
    _check_dataframe(dv=dv, between=between, data=data,
                     effects='between')

    # Remove NaN values
    data = data[[dv, between]].dropna()

    # Reset index (avoid duplicate axis error)
    data = data.reset_index(drop=True)

    # Extract number of groups and total sample size
    n_groups = data[between].nunique()
    n = data[dv].size

    # Rank data, dealing with ties appropriately. The ranks are kept in a
    # standalone array rather than written to a 'rank' column, so a `dv` or
    # `between` column that happens to be named 'rank' is never overwritten.
    ranks = scipy.stats.rankdata(data[dv])

    # Find the total of rank per groups
    grp = pd.Series(ranks, index=data.index).groupby(data[between],
                                                     observed=True)
    sum_rk_grp = grp.sum().to_numpy()
    n_per_grp = grp.count().to_numpy()

    # Calculate chi-square statistic (H)
    H = (12 / (n * (n + 1)) * np.sum(sum_rk_grp**2 / n_per_grp)) - 3 * (n + 1)

    # Correct for ties
    H /= scipy.stats.tiecorrect(ranks)

    # Calculate DOF and p-value
    ddof1 = n_groups - 1
    p_unc = scipy.stats.chi2.sf(H, ddof1)

    # Create output dataframe
    stats = pd.DataFrame({'Source': between,
                          'ddof1': ddof1,
                          'H': H,
                          'p-unc': p_unc,
                          }, index=['Kruskal'])
    return _postprocess_dataframe(stats)
Ejemplo n.º 5
0
def cochran(data=None, dv=None, within=None, subject=None):
    """Cochran Q test. A special case of the Friedman test when the dependent
    variable is binary.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame, in either wide or long format. The data is treated as
        wide format when ``dv``, ``within`` and ``subject`` are all left to
        None.
    dv : string
        Name of column containing the dependent variable (only required if ``data`` is in
        long format).
    within : string
        Name of column containing the within-subject factor (only required if ``data`` is in
        long format). Two or more within-factor are not currently supported.
    subject : string
        Name of column containing the subject/rater identifier (only required if ``data`` is in
        long format).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor (``'Within'`` for
          wide-format input)
        * ``'dof'``: degrees of freedom
        * ``'Q'``: The Cochran Q statistic
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Cochran Q test [1]_ is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary.

    The Q statistics is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n` where
    :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    Missing values are automatically removed using a strict listwise approach
    (= complete-case analysis). In other words, any subject with one or
    more missing value(s) is completely removed from the dataframe prior to running the
    test.

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256–266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof         Q     p-unc
    cochran   Time    2  6.705882  0.034981

    Same but using a wide-format dataframe

    >>> df_wide = df.pivot_table(index="Subject", columns="Time", values="Energetic")
    >>> cochran(df_wide)
             Source  dof         Q     p-unc
    cochran  Within    2  6.705882  0.034981
    """
    # Wide-format input is signalled by leaving all three column names unset;
    # it is reshaped to long format under fixed column names.
    is_wide = dv is None and within is None and subject is None
    if is_wide:
        assert isinstance(data, pd.DataFrame)
        # Keep numeric columns only, then drop incomplete rows (listwise)
        data = data._get_numeric_data().dropna()
        assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
        assert data.shape[1] > 1, "Data must contain at least two columns."
        data['Subj'] = np.arange(data.shape[0])
        subject, within, dv = 'Subj', 'Within', 'DV'
        data = data.melt(id_vars=subject, var_name=within, value_name=dv)

    # Validate the long-format table and its identifier columns
    _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects='within')
    assert not data[within].isnull().any(), "Cannot have missing values in `within`."
    assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."

    # Round trip through a subject x condition table:
    # - implicit missing cells become explicit NaN,
    # - repeated subject/condition entries are averaged by pivot_table,
    # - dropna() then discards every subject with at least one missing cell
    #   (strict complete-case analysis, same behavior as JASP).
    complete_cases = data.pivot_table(
        index=subject, columns=within, values=dv, observed=True).dropna()
    data = complete_cases.melt(ignore_index=False, value_name=dv).reset_index()

    # Number of conditions and degrees of freedom
    k = data[within].nunique()
    dof = k - 1

    # Totals per condition, per subject, and overall
    cond_totals = data.groupby(within, observed=True)[dv].sum()
    subj_totals = data.groupby(subject, observed=True)[dv].sum()
    grand_total = cond_totals.sum()

    # Q statistic and its chi-square p-value
    numerator = dof * (k * np.sum(cond_totals**2) - grand_total**2)
    denominator = k * grand_total - np.sum(subj_totals**2)
    q = numerator / denominator
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Single-row result table
    result = pd.DataFrame(
        {'Source': within, 'dof': dof, 'Q': q, 'p-unc': p_unc},
        index=['cochran'])
    return _postprocess_dataframe(result)
Ejemplo n.º 6
0
def mwu(x, y, tail='two-sided'):
    """Mann-Whitney U Test (= Wilcoxon rank-sum test), the non-parametric
    counterpart of the independent T-test.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    tail : string
        One of `'two-sided'`, `'one-sided'`, `'greater'` or `'less'`.
        With ``tail='one-sided'`` the direction is picked automatically by
        comparing the medians of ``x`` and ``y``: if
        median(``x``) < median(``y``) the test is run with ``tail='less'``,
        otherwise with ``tail='greater'``.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'U-val'``: U-value
        * ``'tail'``: alternative actually used for the test
        * ``'p-val'``: p-value
        * ``'RBC'``   : rank-biserial correlation
        * ``'CLES'``  : common language effect size

    See also
    --------
    scipy.stats.mannwhitneyu, wilcoxon, ttest

    Notes
    -----
    The Mann–Whitney U test [1]_ (also called Wilcoxon rank-sum test) is a
    non-parametric test of the null hypothesis that it is equally likely that
    a randomly selected value from one sample will be less than or greater
    than a randomly selected value from a second sample. The test assumes
    that the two samples are independent. This test corrects for ties and by
    default uses a continuity correction
    (see :py:func:`scipy.stats.mannwhitneyu` for details).

    The rank biserial correlation [2]_ is the difference between
    the proportion of favorable evidence minus the proportion of unfavorable
    evidence.

    The common language effect size is the proportion of pairs where ``x`` is
    higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
    Pingouin uses a brute-force version of the formula given by Vargha and
    Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    This method has two advantages. First, the brute-force approach pairs
    each observation of ``x`` to its ``y`` counterpart, and therefore does
    not require normally distributed data. Second, the formula takes ties
    into account and therefore works with ordinal data.

    When tail is ``'less'``, the CLES is then set to :math:`1 - \\text{CL}`,
    which gives the proportion of pairs where ``x`` is *lower* than ``y``.

    References
    ----------
    .. [1] Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
           two random variables is stochastically larger than the other.
           The annals of mathematical statistics, 50-60.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
           teaching nonparametric correlation. Comprehensive Psychology,
           3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
           statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
        the “CL” Common Language Effect Size Statistics of McGraw and Wong.
        Journal of Educational and Behavioral Statistics: A Quarterly
        Publication Sponsored by the American Educational Research
        Association and the American Statistical Association, 25(2),
        101–132. https://doi.org/10.2307/1165329

    Examples
    --------
    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.uniform(low=0, high=1, size=20)
    >>> y = np.random.uniform(low=0.2, high=1.2, size=20)
    >>> pg.mwu(x, y, tail='two-sided')
         U-val       tail    p-val    RBC    CLES
    MWU   97.0  two-sided  0.00556  0.515  0.2425

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.mannwhitneyu(x, y, use_continuity=True,
    ...                          alternative='two-sided')
    MannwhitneyuResult(statistic=97.0, pvalue=0.0055604599321374135)

    One-sided tail: one can either manually specify the alternative hypothesis

    >>> pg.mwu(x, y, tail='greater')
         U-val     tail     p-val    RBC    CLES
    MWU   97.0  greater  0.997442  0.515  0.2425

    >>> pg.mwu(x, y, tail='less')
         U-val  tail    p-val    RBC    CLES
    MWU   97.0  less  0.00278  0.515  0.7575

    Or simply leave it to Pingouin, using the `'one-sided'` argument, in which
    case Pingouin will compare the medians of ``x`` and ``y`` and select the
    most appropriate tail based on that:

    >>> # Since np.median(x) < np.median(y), this is equivalent to tail='less'
    >>> pg.mwu(x, y, tail='one-sided')
         U-val  tail    p-val    RBC    CLES
    MWU   97.0  less  0.00278  0.515  0.7575
    """
    # Drop missing values from each sample independently
    x, y = remove_na(np.asarray(x), np.asarray(y), paired=False)

    # Validate the alternative and resolve the automatic 'one-sided' choice
    assert tail in ('two-sided', 'one-sided', 'greater', 'less'), \
        'Invalid tail argument.'
    if tail == 'one-sided':
        # Direction follows the ordering of the two medians
        tail = 'less' if np.median(x) < np.median(y) else 'greater'

    # Mann-Whitney U test with continuity correction
    u_stat, p_value = scipy.stats.mannwhitneyu(
        x, y, use_continuity=True, alternative=tail)

    # Common language effect size (Vargha and Delaney 2000), computed over
    # every (x_i, y_j) combination with ties counted as 0.5. The value is
    # expressed for tail='greater' (identical for 'two-sided') and flipped
    # for tail='less'.
    pairwise = x[:, None] - y
    common_lang = np.where(pairwise == 0, 0.5, pairwise > 0).mean()
    if tail == 'less':
        common_lang = 1 - common_lang

    # Rank biserial correlation (Wendt 1972); the number of cells in the
    # pairwise matrix is x.size * y.size.
    n_pairs = pairwise.size
    rank_biserial = 1 - (2 * u_stat) / n_pairs

    # Assemble the single-row result table
    columns = {
        'U-val': u_stat,
        'tail': tail,
        'p-val': p_value,
        'RBC': rank_biserial,
        'CLES': common_lang,
    }
    return _postprocess_dataframe(pd.DataFrame(columns, index=['MWU']))
Ejemplo n.º 7
0
def friedman(data=None, dv=None, within=None, subject=None, method='chisq'):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Both wide and long-format dataframe are supported for this test.
    dv : string
        Name of column containing the dependent variable (only required if ``data`` is in
        long format).
    within : string
        Name of column containing the within-subject factor (only required if ``data`` is in
        long format). Two or more within-factors are not currently supported.
    subject : string
        Name of column containing the subject/rater identifier (only required if ``data`` is in
        long format).
    method : string
        Statistical test to perform. Must be ``'chisq'`` (chi-square test) or ``'f'`` (F test).
        See notes below for explanation.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'W'``: Kendall's coefficient of concordance, corrected for ties

        If ``method='chisq'``

        * ``'ddof1'``: degrees of freedom
        * ``'Q'``: The Friedman chi-square statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value of the chi squared test


        If ``method='f'``

        * ``'ddof1'``: degrees of freedom of the numerator
        * ``'ddof2'``: degrees of freedom of the denominator
        * ``'F'``: The Friedman F statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value of the F test

    Raises
    ------
    ValueError
        If ``method`` is neither ``'chisq'`` nor ``'f'``.

    Notes
    -----
    The Friedman test is used for non-parametric (rank-based) one-way repeated measures ANOVA.

    It is equivalent to the test of significance of Kendall's
    coefficient of concordance (Kendall's W). Most commonly a Q statistic,
    which has asymptotical chi-squared distribution, is computed and used for
    testing. However, the chi-squared test tends to be overly conservative for small numbers
    of samples and/or repeated measures, in which case an F-test is more adequate [1]_.

    Data can be in wide or long format. Missing values are automatically removed using a
    strict listwise approach (= complete-case analysis). In other words, any subject with one or
    more missing value(s) is completely removed from the dataframe prior to running the
    test.

    References
    ----------
    .. [1] Marozzi, M. (2014). Testing for concordance between several
           criteria. Journal of Statistical Computation and Simulation,
           84(9), 1843–1850. https://doi.org/10.1080/00949655.2013.766189

    .. [2] https://www.real-statistics.com/anova-repeated-measures/friedman-test/

    Examples
    --------
    Compute the Friedman test for repeated measurements, using a wide-format dataframe

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pd.DataFrame({
    ...    'white': {0: 10, 1: 8, 2: 7, 3: 9, 4: 7, 5: 4, 6: 5, 7: 6, 8: 5, 9: 10, 10: 4, 11: 7},
    ...    'red': {0: 7, 1: 5, 2: 8, 3: 6, 4: 5, 5: 7, 6: 9, 7: 6, 8: 4, 9: 6, 10: 7, 11: 3},
    ...    'rose': {0: 8, 1: 5, 2: 6, 3: 4, 4: 7, 5: 5, 6: 3, 7: 7, 8: 6, 9: 4, 10: 4, 11: 3}})
    >>> pg.friedman(df)
              Source         W  ddof1    Q     p-unc
    Friedman  Within  0.083333      2  2.0  0.367879

    Compare with SciPy

    >>> from scipy.stats import friedmanchisquare
    >>> friedmanchisquare(*df.to_numpy().T)
    FriedmanchisquareResult(statistic=1.9999999999999893, pvalue=0.3678794411714444)

    Using a long-format dataframe

    >>> df_long = df.melt(ignore_index=False).reset_index()
    >>> pg.friedman(data=df_long, dv="value", within="variable", subject="index")
                Source         W  ddof1    Q     p-unc
    Friedman  variable  0.083333      2  2.0  0.367879

    Using the F-test method

    >>> pg.friedman(df, method="f")
              Source         W     ddof1      ddof2    F     p-unc
    Friedman  Within  0.083333  1.833333  20.166667  1.0  0.378959
    """
    # Fail fast on an unsupported method: otherwise neither branch below would
    # bind ``stats`` and the function would die with an UnboundLocalError.
    if method not in ('chisq', 'f'):
        raise ValueError("method must be 'chisq' or 'f'.")

    # Convert from wide to long-format, if needed
    if all(v is None for v in (dv, within, subject)):
        assert isinstance(data, pd.DataFrame)
        # Listwise deletion of missing values. ``dropna`` returns a new object,
        # so adding the 'Subj' column below does not touch the caller's dataframe.
        data = data._get_numeric_data().dropna()
        assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
        assert data.shape[1] > 1, "Data must contain at least two columns."
        data['Subj'] = np.arange(data.shape[0])
        data = data.melt(id_vars='Subj', var_name='Within', value_name='DV')
        subject, within, dv = 'Subj', 'Within', 'DV'

    # Check dataframe
    _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects='within')
    assert not data[within].isnull().any(), "Cannot have missing values in `within`."
    assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."

    # Pivot the table to a wide-format dataframe. This has several effects:
    # 1) Force missing values to be explicit (a NaN cell is created)
    # 2) Automatic collapsing to the mean if multiple within factors are present
    # 3) If using dropna, remove rows with missing values (listwise deletion).
    # The latter is the same behavior as JASP (= strict complete-case analysis).
    data_piv = data.pivot_table(index=subject, columns=within, values=dv, observed=True)
    data_piv = data_piv.dropna()

    # Extract data in numpy array (rows = subjects, columns = conditions)
    # and rank the conditions within each subject
    X = data_piv.to_numpy()
    n, k = X.shape
    ranked = scipy.stats.rankdata(X, axis=1)
    ssbn = (ranked.sum(axis=0)**2).sum()

    # Correction for ties: sum of t * (t^2 - 1) over every group of t equal
    # values within a subject. Values that occur once (t = 1) contribute 0, so
    # all counts returned by np.unique can be summed without filtering.
    ties = 0
    for row in X:
        _, counts = np.unique(row, return_counts=True)
        ties += int(np.sum(counts * (counts * counts - 1)))

    # Compute Kendall's W corrected for ties
    W = (12 * ssbn - 3 * n**2 * k * (k + 1)**2) / (n**2 * k * (k - 1) * (k + 1) - n * ties)

    if method == 'chisq':
        # Compute the Q statistic
        Q = n * (k - 1) * W
        # Approximate the p-value
        ddof1 = k - 1
        p_unc = scipy.stats.chi2.sf(Q, ddof1)
        # Create output dataframe
        stats = pd.DataFrame({
            'Source': within, 'W': W, 'ddof1': ddof1, 'Q': Q, 'p-unc': p_unc}, index=['Friedman'])
    else:
        # method == 'f' (validated above): compute the F statistic
        F = W * (n - 1) / (1 - W)
        # Approximate the p-value
        ddof1 = k - 1 - 2 / n
        ddof2 = (n - 1) * ddof1
        p_unc = scipy.stats.f.sf(F, ddof1, ddof2)
        # Create output dataframe
        stats = pd.DataFrame({
            'Source': within, 'W': W, 'ddof1': ddof1, 'ddof2': ddof2, 'F': F, 'p-unc': p_unc},
            index=['Friedman'])
    return _postprocess_dataframe(stats)
Ejemplo n.º 8
0
def wilcoxon(x, y=None, alternative='two-sided', **kwargs):
    """
    Wilcoxon signed-rank test, the non-parametric counterpart of the paired T-test.

    Parameters
    ----------
    x : array_like
        One-dimensional array holding either the first set of measurements
        (``y`` is then the second set), or the pre-computed differences between
        two sets of measurements (``y`` must then be left unspecified).
    y : array_like
        One-dimensional array holding the second set of measurements when ``x``
        is the first set. Leave unspecified when ``x`` already contains the
        differences between the two sets of measurements.
    alternative : string
        Defines the alternative hypothesis, or tail of the test. Must be one of
        "two-sided" (default), "greater" or "less". See :py:func:`scipy.stats.wilcoxon` for
        more details.
    **kwargs : dict
        Additional keywords arguments that are passed to :py:func:`scipy.stats.wilcoxon`.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W-val'``: W-value
        * ``'alternative'``: tail of the test
        * ``'p-val'``: p-value
        * ``'RBC'``   : matched pairs rank-biserial correlation (effect size)
        * ``'CLES'``  : common language effect size

    See also
    --------
    scipy.stats.wilcoxon, mwu

    Notes
    -----
    The Wilcoxon signed-rank test [1]_ tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero.

    .. important:: Pingouin automatically applies a continuity correction.
        Therefore, the p-values will be slightly different than
        :py:func:`scipy.stats.wilcoxon` unless ``correction=True`` is
        explicitly passed to the latter.

    In addition to the test statistic and p-values, Pingouin also computes two
    measures of effect size. The matched pairs rank biserial correlation [2]_
    is the simple difference between the proportion of favorable and
    unfavorable evidence; in the case of the Wilcoxon signed-rank test,
    the evidence consists of rank sums (Kerby 2014):

    .. math:: r = f - u

    The common language effect size is the proportion of pairs where ``x`` is
    higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
    Pingouin uses a brute-force version of the formula given by Vargha and
    Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    The advantages of this method are twofold. First, the brute-force
    approach pairs each observation of ``x`` to its ``y`` counterpart, and
    therefore does not require normally distributed data. Second, the formula
    takes ties into account and therefore works with ordinal data.

    When ``alternative`` is ``'less'``, the CLES is then set to
    :math:`1 - \\text{CL}`, which gives the proportion of pairs where ``x`` is
    *lower* than ``y``.

    References
    ----------
    .. [1] Wilcoxon, F. (1945). Individual comparisons by ranking methods.
           Biometrics bulletin, 1(6), 80-83.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
           teaching nonparametric correlation. Comprehensive Psychology,
           3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
           statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
           the “CL” Common Language Effect Size Statistics of McGraw and Wong.
           Journal of Educational and Behavioral Statistics: A Quarterly
           Publication Sponsored by the American Educational Research
           Association and the American Statistical Association, 25(2),
           101–132. https://doi.org/10.2307/1165329

    Examples
    --------
    Wilcoxon test on two related samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = np.array([20, 22, 19, 20, 22, 18, 24, 20, 19, 24, 26, 13])
    >>> y = np.array([38, 37, 33, 29, 14, 12, 20, 22, 17, 25, 26, 16])
    >>> pg.wilcoxon(x, y, alternative='two-sided')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5   two-sided  0.285765 -0.378788  0.395833

    Same but using pre-computed differences. However, the CLES effect size
    cannot be computed as it requires the raw data.

    >>> pg.wilcoxon(x - y)
              W-val alternative     p-val       RBC  CLES
    Wilcoxon   20.5   two-sided  0.285765 -0.378788   NaN

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.wilcoxon(x, y)
    WilcoxonResult(statistic=20.5, pvalue=0.2661660677806492)

    The p-value is not exactly similar to Pingouin. This is because Pingouin automatically applies
    a continuity correction. Disabling it gives the same p-value as scipy:

    >>> pg.wilcoxon(x, y, alternative='two-sided', correction=False)
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5   two-sided  0.266166 -0.378788  0.395833

    One-sided test

    >>> pg.wilcoxon(x, y, alternative='greater')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5     greater  0.876244 -0.378788  0.395833

    >>> pg.wilcoxon(x, y, alternative='less')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5        less  0.142883 -0.378788  0.604167
    """
    x = np.asarray(x)
    has_second_sample = y is not None

    # Drop missing values: pairwise when two samples are given, otherwise
    # directly on the pre-computed differences.
    if has_second_sample:
        y = np.asarray(y)
        x, y = remove_na(x, y, paired=True)
    else:
        x = x[~np.isnan(x)]

    # Validate the requested tail and reject the pre-0.4.0 keyword
    assert alternative in ['two-sided', 'greater', 'less'], (
        "Alternative must be one of 'two-sided' (default), 'greater' or 'less'.")
    if "tail" in kwargs:
        raise ValueError(
            "Since Pingouin 0.4.0, the 'tail' argument has been renamed to 'alternative'.")

    # Run the test; the continuity correction is on unless the caller opts out
    kwargs.setdefault("correction", True)
    wval, pval = scipy.stats.wilcoxon(x=x, y=y, alternative=alternative, **kwargs)

    if has_second_sample:
        # Effect size 1: Common Language Effect Size (Vargha and Delaney 2000),
        # computed over every (x_i, y_j) combination with ties counted as 0.5.
        # The 'two-sided' and 'greater' alternatives share the same value;
        # 'less' reports the complement.
        pairwise = x[:, None] - y
        cles = np.where(pairwise == 0, 0.5, pairwise > 0).mean()
        if alternative == 'less':
            cles = 1 - cles
        # Non-zero paired differences, used for the rank-biserial correlation
        signed = x - y
        signed = signed[signed != 0]
    else:
        # CLES requires the raw samples and cannot be derived from differences
        cles = np.nan
        signed = x[x != 0]

    # Effect size 2: matched-pairs rank biserial correlation (Kerby 2014),
    # i.e. share of the rank sum carried by positive differences minus the
    # share carried by negative differences.
    ranks = scipy.stats.rankdata(abs(signed))
    rank_total = ranks.sum()
    positive_ranks = np.sum((signed > 0) * ranks)
    negative_ranks = np.sum((signed < 0) * ranks)
    rbc = positive_ranks / rank_total - negative_ranks / rank_total

    # Assemble the single-row output
    columns = {
        'W-val': wval,
        'alternative': alternative,
        'p-val': pval,
        'RBC': rbc,
        'CLES': cles,
    }
    stats = pd.DataFrame(columns, index=['Wilcoxon'])
    return _postprocess_dataframe(stats)
Ejemplo n.º 9
0
def friedman(data=None, dv=None, within=None, subject=None):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. It is not modified by this function.
    dv : string
        Name of column containing the dependent variable.
    within : string
        Name of column containing the within-subject factor.
    subject : string
        Name of column containing the subject identifier.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Source'``: Name of the within-subject factor
        * ``'ddof1'``: degrees of freedom
        * ``'Q'``: The Friedman Q statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value

    Notes
    -----
    The Friedman test is used for one-way repeated measures ANOVA by ranks.

    Data are expected to be in long-format.

    Note that if the dataset contains one or more other within subject
    factors, an automatic collapsing to the mean is applied on the dependent
    variable (same behavior as the ezANOVA R package). As such, results can
    differ from those of JASP. If you can, always double-check the results.

    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than 6
    repeated measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Friedman test for repeated measurements.

    >>> from pingouin import friedman, read_dataset
    >>> df = read_dataset('rm_anova')
    >>> friedman(data=df, dv='DesireToKill', within='Disgustingness',
    ...          subject='Subject')
                      Source  ddof1         Q     p-unc
    Friedman  Disgustingness      1  9.227848  0.002384
    """
    # Check data
    _check_dataframe(dv=dv,
                     within=within,
                     data=data,
                     subject=subject,
                     effects='within')

    # Work on a private copy restricted to the three columns that matter.
    # This keeps the dtype conversion below from leaking into the caller's
    # dataframe, and keeps unrelated (possibly non-numeric) columns out of
    # the mean aggregation.
    data = data[[subject, within, dv]].copy()

    # Convert Categorical columns to string
    # This is important otherwise all the groupby will return different results
    # unless we specify .groupby(..., observed = True).
    for c in (subject, within):
        if data[c].dtype.name == 'category':
            data[c] = data[c].astype(str)

    # Collapse to the mean
    data = data.groupby([subject, within]).mean().reset_index()

    # Remove NaN
    if data[dv].isnull().any():
        data = remove_rm_na(dv=dv,
                            within=within,
                            subject=subject,
                            data=data[[subject, within, dv]])

    # Build the (n subjects x k conditions) matrix of observations.
    # NOTE(review): this relies on every condition holding the same subjects
    # in the same order after the groupby above.
    grp = data.groupby(within)[dv]
    rm = list(data[within].unique())
    k = len(rm)
    X = np.array([grp.get_group(r).to_numpy() for r in rm]).T
    n = X.shape[0]

    # Rank per subject
    ranked = np.zeros(X.shape)
    for i, row in enumerate(X):
        ranked[i] = scipy.stats.rankdata(row)

    ssbn = (ranked.sum(axis=0)**2).sum()

    # Compute the test statistic
    Q = (12 / (n * k * (k + 1))) * ssbn - 3 * n * (k + 1)

    # Correct for ties: sum of t * (t^2 - 1) over every group of t equal
    # values within a subject. Values that occur once (t = 1) contribute 0, so
    # all counts returned by np.unique can be summed without filtering.
    ties = 0
    for row in X:
        _, counts = np.unique(row, return_counts=True)
        ties += int(np.sum(counts * (counts * counts - 1)))

    c = 1 - ties / float(k * (k * k - 1) * n)
    Q /= c

    # Approximate the p-value
    ddof1 = k - 1
    p_unc = scipy.stats.chi2.sf(Q, ddof1)

    # Create output dataframe
    stats = pd.DataFrame(
        {
            'Source': within,
            'ddof1': ddof1,
            'Q': Q,
            'p-unc': p_unc,
        },
        index=['Friedman'])
    return _postprocess_dataframe(stats)
Ejemplo n.º 10
0
def chi2_mcnemar(data, x, y, correction=True):
    """
    Run McNemar's test in both its exact and approximated forms.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test. Each row must
        represent either a subject or a pair of subjects.
    x, y : string
        The variables names for the McNemar's test. Must be names of columns
        in ``data``.

        If each row of ``data`` represents a subject, then ``x`` and ``y`` must
        be columns containing dichotomous measurements in two different
        contexts. For instance: the presence of pain before and after a certain
        treatment.

        If each row of ``data`` represents a pair of subjects, then ``x`` and
        ``y`` must be columns containing dichotomous measurements for each of
        the subjects. For instance: a positive response to a certain drug in
        the control group and in the test group, supposing that each pair
        contains a subject in each group.

        The 2x2 crosstab is created using the
        :py:func:`pingouin.dichotomous_crosstab` function.

        .. warning:: Missing values are not allowed.

    correction : bool
        Whether to apply the correction for continuity (Edwards, A. 1948).

    Returns
    -------
    observed : :py:class:`pandas.DataFrame`
        The observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary:

        * ``'chi2'``: The test statistic
        * ``'dof'``: The degree of freedom
        * ``'p-approx'``: The approximated p-value
        * ``'p-exact'``: The exact p-value

    Raises
    ------
    ValueError
        If ``x`` or ``y`` contains missing values, or if both off-diagonal
        cells of the observed table are zero.

    Notes
    -----
    The McNemar's test is compatible with dichotomous paired data, generally
    used to assert the effectiveness of a certain procedure, such as a
    treatment or the use of a drug. "Dichotomous" means that the values of the
    measurements are binary. "Paired data" means that each measurement is done
    twice, either on the same subject in two different moments or in two
    similar (paired) subjects from different groups (e.g.: control/test). In
    order to better understand the idea behind McNemar's test, let's illustrate
    it with an example.

    Suppose that we wanted to compare the effectiveness of two different
    treatments (X and Y) for athlete's foot on a certain group of `n` people.
    To achieve this, we measured their responses to such treatments on each
    foot. The observed data summary was:

    * Number of people with good responses to X and Y: `a`
    * Number of people with good response to X and bad response to Y: `b`
    * Number of people with bad response to X and good response to Y: `c`
    * Number of people with bad responses to X and Y: `d`

    Now consider the two groups:

    1. The group of people who had good response to X (`a` + `b` subjects)
    2. The group of people who had good response to Y (`a` + `c` subjects)

    If the treatments have the same effectiveness, we should expect the
    probabilities of having good responses to be the same, regardless of the
    treatment. Mathematically, such statement can be translated into the
    following equation:

    .. math::

        \\frac{a+b}{n} = \\frac{a+c}{n} \\Rightarrow b = c

    Thus, this test should indicate higher statistical significances for higher
    distances between `b` and `c` (McNemar, Q. 1947):

    .. math::

        \\chi^2 = \\frac{(b - c)^2}{b + c}

    References
    ----------
    * Edwards, A. L. (1948). Note on the "correction for continuity" in
      testing the significance of the difference between correlated
      proportions. Psychometrika, 13(3), 185-187.

    * McNemar, Q. (1947). Note on the sampling error of the difference
      between correlated proportions or percentages. Psychometrika, 12(2),
      153-157.

    Examples
    --------
    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_mcnemar')
    >>> observed, stats = pg.chi2_mcnemar(data, 'treatment_X', 'treatment_Y')
    >>> observed
    treatment_Y   0   1
    treatment_X
    0            20  40
    1             8  12

    In this case, `c` (40) seems to be significantly greater than `b` (8).
    The McNemar test should be sensitive to this.

    >>> stats
                chi2  dof  p-approx   p-exact
    mcnemar  20.020833    1  0.000008  0.000003
    """
    # Python code initially inspired by statsmodel's mcnemar
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    variables = (x, y)
    assert all(isinstance(name, str) for name in variables), \
        'procedures must contain strings, only.'
    assert all(name in data.columns for name in variables), \
        'columns are not in dataframe.'

    # Missing values would silently distort the 2x2 table, so refuse them
    if any(data[name].isna().any() for name in variables):
        raise ValueError('Null values are not allowed.')

    observed = dichotomous_crosstab(data, x, y)

    # Discordant cells. Careful: b and c are swapped relative to the Wikipedia
    # layout because the crosstab rows/columns are ordered [0, 1], not [1, 0].
    b = observed.at[1, 0]
    c = observed.at[0, 1]
    n_discordants = b + c

    if b == 0 and c == 0:
        raise ValueError("McNemar's test does not work if the secondary "
                         "diagonal of the observed data summary does not "
                         "have values different from 0.")

    # Approximate statistic, with optional continuity correction of 1
    continuity = int(correction)
    chi2 = (abs(b - c) - continuity)**2 / n_discordants

    # Exact two-sided binomial p-value on the discordant pairs, capped at 1
    smaller_cell = min(b, c)
    pexact = min(1, 2 * binom.cdf(smaller_cell, n_discordants, 0.5))

    summary = pd.DataFrame(
        {
            'chi2': chi2,
            'dof': 1,
            'p-approx': sp_chi2.sf(chi2, 1),
            'p-exact': pexact,
        },
        index=['mcnemar'])

    return observed, _postprocess_dataframe(summary)
Ejemplo n.º 11
0
def chi2_independence(data, x, y, correction=True):
    """
    Chi-squared independence tests between two categorical variables.

    The test is computed for different values of :math:`\\lambda`: 1, 2/3, 0,
    -1/2, -1 and -2 (Cressie and Read, 1984).

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the occurrences for the test.
    x, y : string
        The variables names for the Chi-squared test. Must be names of columns
        in ``data``.
    correction : bool
        Whether to apply Yates' correction when the degree of freedom of the
        observed contingency table is 1 (Yates 1934).

    Returns
    -------
    expected : :py:class:`pandas.DataFrame`
        The expected contingency table of frequencies.
    observed : :py:class:`pandas.DataFrame`
        The (corrected or not) observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary, with one row per statistic and the following
        columns:

        * ``'test'``: The statistic name
        * ``'lambda'``: The :math:`\\lambda` value used for the power
          divergence statistic
        * ``'chi2'``: The test statistic
        * ``'dof'``: The degrees of freedom
        * ``'pval'``: The p-value of the test
        * ``'cramer'``: The Cramer's V effect size
        * ``'power'``: The statistical power of the test

    Raises
    ------
    ValueError
        If the observed contingency table is empty.

    Notes
    -----
    From Wikipedia:

    *The chi-squared test is used to determine whether there is a significant
    difference between the expected frequencies and the observed frequencies
    in one or more categories.*

    As application examples, this test can be used to *i*) evaluate the
    quality of a categorical variable in a classification problem or to *ii*)
    check the similarity between two categorical variables. In the first
    example, a good categorical predictor and the class column should present
    high :math:`\\chi^2` and low p-value. In the second example, similar
    categorical variables should present low :math:`\\chi^2` and high p-value.

    This function is a wrapper around the
    :py:func:`scipy.stats.power_divergence` function.

    The sample size used for Cramer's V and for the power is the total count
    of the contingency table, i.e. rows of ``data`` with a missing value in
    ``x`` or ``y`` are not counted.

    .. warning :: As a general guideline for the consistency of this test, the
        observed and the expected contingency tables should not have cells
        with frequencies lower than 5.

    References
    ----------
    * Cressie, N., & Read, T. R. (1984). Multinomial goodness‐of‐fit
      tests. Journal of the Royal Statistical Society: Series B
      (Methodological), 46(3), 440-464.

    * Yates, F. (1934). Contingency Tables Involving Small Numbers and the
      :math:`\\chi^2` Test. Supplement to the Journal of the Royal
      Statistical Society, 1, 217-235.

    Examples
    --------
    Let's see if gender is a good categorical predictor for the presence of
    heart disease.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_independence')
    >>> data['sex'].value_counts(ascending=True)
    0     96
    1    207
    Name: sex, dtype: int64

    If gender is not a good predictor for heart disease, we should expect the
    same 96:207 ratio across the target classes.

    >>> expected, observed, stats = pg.chi2_independence(data, x='sex',
    ...                                                  y='target')
    >>> expected
    target          0           1
    sex
    0       43.722772   52.277228
    1       94.277228  112.722772

    Let's see what the data tells us.

    >>> observed
    target      0     1
    sex
    0        24.5  71.5
    1       113.5  93.5

    The proportion is lower on the class 0 and higher on the class 1. The
    tests should be sensitive to this difference.

    >>> stats.round(3)
                     test  lambda    chi2  dof  pval  cramer  power
    0             pearson   1.000  22.717  1.0   0.0   0.274  0.997
    1        cressie-read   0.667  22.931  1.0   0.0   0.275  0.998
    2      log-likelihood   0.000  23.557  1.0   0.0   0.279  0.998
    3       freeman-tukey  -0.500  24.220  1.0   0.0   0.283  0.998
    4  mod-log-likelihood  -1.000  25.071  1.0   0.0   0.288  0.999
    5              neyman  -2.000  27.458  1.0   0.0   0.301  0.999

    Very low p-values indeed. The gender qualifies as a good predictor for the
    presence of heart disease on this dataset.
    """
    # Python code inspired by SciPy's chi2_contingency
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert isinstance(x, str), 'x must be a string.'
    assert isinstance(y, str), 'y must be a string.'
    assert all(col in data.columns for col in (x, y)),\
        'columns are not in dataframe.'
    assert isinstance(correction, bool), 'correction must be a boolean.'

    observed = pd.crosstab(data[x], data[y])

    if observed.size == 0:
        raise ValueError('No data; observed has size 0.')

    expected = pd.DataFrame(expected_freq(observed),
                            index=observed.index,
                            columns=observed.columns)

    # All count frequencies should be at least 5
    for df, name in zip([observed, expected], ['observed', 'expected']):
        if (df < 5).any(axis=None):
            warnings.warn('Low count on {} frequencies.'.format(name))

    dof = float(expected.size - sum(expected.shape) + expected.ndim - 1)

    # Sample size = number of observations that actually entered the table.
    # Using data.shape[0] would also count rows dropped by the crosstab because
    # of a missing value in x or y, which deflates Cramer's V and distorts the
    # power. Taken before the Yates adjustment so it is an exact integer count.
    n = int(observed.to_numpy().sum())

    if dof == 1 and correction:
        # Adjust `observed` according to Yates' correction for continuity.
        observed = observed + 0.5 * np.sign(expected - observed)

    # `ddof` is the adjustment passed to power_divergence so that its p-value
    # is based on `dof` degrees of freedom (size - 1 - ddof == dof).
    ddof = observed.size - 1 - dof
    # Normalizing term of Cramer's V; does not depend on lambda
    dof_cramer = min(expected.shape) - 1
    stats = []
    names = [
        "pearson", "cressie-read", "log-likelihood", "freeman-tukey",
        "mod-log-likelihood", "neyman"
    ]

    for name, lambda_ in zip(names, [1.0, 2 / 3, 0.0, -1 / 2, -1.0, -2.0]):
        if dof == 0:
            # Degenerate table (a single row or column): nothing to test
            chi2, p, cramer, power = 0.0, 1.0, np.nan, np.nan
        else:
            chi2, p = power_divergence(observed,
                                       expected,
                                       ddof=ddof,
                                       axis=None,
                                       lambda_=lambda_)
            cramer = np.sqrt(chi2 / (n * dof_cramer))
            power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05)

        stats.append({
            'test': name,
            'lambda': lambda_,
            'chi2': chi2,
            'dof': dof,
            'pval': p,
            'cramer': cramer,
            'power': power
        })

    stats = pd.DataFrame(stats)[[
        'test', 'lambda', 'chi2', 'dof', 'pval', 'cramer', 'power'
    ]]
    return expected, observed, _postprocess_dataframe(stats)