Example #1
0
 def test_anova(self):
     """Exercise ``anova`` and ``anova2`` on one-way and two-way designs.

     The one-way results on the pain dataset are compared with reference
     values obtained from JASP; the two-way calls are only checked to run.
     """
     # One-way ANOVA on the pain dataset, also exporting the table to CSV
     pain = read_dataset('anova')
     oneway = anova(data=pain,
                    dv='Pain threshold',
                    between='Hair color',
                    detailed=True,
                    export_filename='test_export.csv')
     # Same factor, passed as a single-element list
     anova(data=pain, dv='Pain threshold', between=['Hair color'])

     # Reference values from JASP
     assert np.allclose(oneway.loc[0, 'F'], 6.791)
     assert np.allclose(np.round(oneway.loc[0, 'p-unc'], 3), .004)
     assert np.allclose(oneway.loc[0, 'np2'], .576)

     # Two-way ANOVA through the generic entry point, with CSV export
     anova(data=df,
           dv='Scores',
           between=['Group', 'Time'],
           export_filename='test_export.csv')
     # ``anova2`` must accept two factors, a one-element list and a string
     for factors in (['Group', 'Time'], ['Group'], 'Group'):
         anova2(dv='Scores', between=factors, data=df)
Example #2
0
def _anova(self,
           dv=None,
           between=None,
           ss_type=2,
           detailed=False,
           export_filename=None):
    """Run ``anova`` using this object as the ``data`` argument.

    Every other argument is forwarded unchanged to ``anova`` and its
    result is returned as-is.
    """
    # Everything except ``data`` is passed straight through.
    options = dict(dv=dv,
                   between=between,
                   ss_type=ss_type,
                   detailed=detailed,
                   export_filename=export_filename)
    return anova(data=self, **options)
Example #3
0
    def test_anova(self):
        """Test function anova.

        Compare results to JASP.

        Covers one-way designs (balanced, unbalanced with missing values,
        categorical factor with an unused category), two-way designs
        (balanced, unbalanced, with missing values, type 1 vs. type 2 sums
        of squares) and three-way designs (balanced and unbalanced, sums of
        squares of type 1, 2 and 3), plus two error paths that must raise
        ``ValueError``.
        """
        # Pain dataset
        df_pain = read_dataset('anova')
        anova(dv='Pain threshold', between=['Hair color'], data=df_pain)
        # Compare with JASP
        aov = anova(dv='Pain threshold',
                    between='Hair color',
                    data=df_pain,
                    detailed=True).round(3)
        assert aov.at[0, 'F'] == 6.791
        assert aov.at[0, 'p-unc'] == .004
        assert aov.at[0, 'np2'] == .576
        aov = anova(dv='Pain threshold',
                    between='Hair color',
                    data=df_pain,
                    effsize="n2",
                    detailed=True).round(3)
        assert aov.at[0, 'n2'] == .576

        # Unbalanced and with missing values
        df_pain.loc[[17, 18], 'Pain threshold'] = np.nan
        aov = df_pain.anova(dv='Pain threshold', between='Hair color').round(3)
        assert aov.at[0, 'ddof1'] == 3
        assert aov.at[0, 'ddof2'] == 13
        assert aov.at[0, 'F'] == 4.359
        assert aov.at[0, 'p-unc'] == 0.025
        assert aov.at[0, 'np2'] == 0.501
        # Error: between is an empty list
        with pytest.raises(ValueError):
            anova(dv='Pain threshold', between=[], data=df_pain)

        # Unbalanced and with missing values AND between as a categorical
        df_paincat = df_pain.copy()
        df_paincat['Hair color'] = df_paincat['Hair color'].astype('category')
        # Add a category that has no observations. The ``inplace`` argument
        # of ``add_categories`` was deprecated in pandas 1.3 and removed in
        # 2.0, so reassign the returned Series instead; this works on every
        # pandas version.
        df_paincat['Hair color'] = (
            df_paincat['Hair color'].cat.add_categories('Bald'))
        aov = df_paincat.anova(dv='Pain threshold',
                               between='Hair color').round(3)
        # The unused category must not change the results
        assert aov.at[0, 'ddof1'] == 3
        assert aov.at[0, 'ddof2'] == 13
        assert aov.at[0, 'F'] == 4.359
        assert aov.at[0, 'p-unc'] == 0.025
        assert aov.at[0, 'np2'] == 0.501

        # Two-way ANOVA with balanced design
        # NOTE(review): the bare ``array_equal(...)`` calls below only verify
        # anything if ``array_equal`` is an asserting helper (e.g.
        # ``numpy.testing.assert_array_equal`` imported under that name).
        # If it is ``numpy.array_equal`` the results are discarded -- confirm
        # against the imports of this file.
        df_aov2 = read_dataset('anova2')
        aov2 = anova(dv="Yield", between=["Blend", "Crop"],
                     data=df_aov2).round(4)
        array_equal(aov2.loc[:, 'MS'],
                    [2.0417, 1368.2917, 1180.0417, 541.8472])
        array_equal(aov2.loc[[0, 1, 2], 'F'], [0.0038, 2.5252, 2.1778])
        array_equal(aov2.loc[[0, 1, 2], 'p-unc'], [0.9517, 0.1080, 0.1422])
        array_equal(aov2.loc[[0, 1, 2], 'np2'], [0.0002, 0.2191, 0.1948])
        # Same but with standard eta-square
        aov2 = anova(dv="Yield",
                     between=["Blend", "Crop"],
                     data=df_aov2,
                     effsize="n2").round(4)
        array_equal(aov2.loc[[0, 1, 2], 'n2'], [0.0001, 0.1843, 0.1589])

        # Two-way ANOVA with unbalanced design
        df_aov2 = read_dataset('anova2_unbalanced')
        aov2 = df_aov2.anova(dv="Scores", between=["Diet",
                                                   "Exercise"]).round(3)
        array_equal(aov2.loc[:, 'MS'], [390.625, 180.625, 15.625, 52.625])
        array_equal(aov2.loc[[0, 1, 2], 'F'], [7.423, 3.432, 0.297])
        array_equal(aov2.loc[[0, 1, 2], 'p-unc'], [0.034, 0.113, 0.605])
        array_equal(aov2.loc[[0, 1, 2], 'np2'], [0.553, 0.364, 0.047])

        # Two-way ANOVA with unbalanced design and missing values
        df_aov2.loc[9, 'Scores'] = np.nan
        # Type 2
        aov2 = anova(dv="Scores", between=["Diet", "Exercise"],
                     data=df_aov2).round(3)
        array_equal(aov2.loc[[0, 1, 2], 'F'], [10.403, 5.167, 0.761])
        array_equal(aov2.loc[[0, 1, 2], 'p-unc'], [0.023, 0.072, 0.423])
        array_equal(aov2.loc[[0, 1, 2], 'np2'], [0.675, 0.508, 0.132])
        # Type 1: must differ from type 2 on this unbalanced design
        aov2_ss1 = anova(dv="Scores",
                         between=["Diet", "Exercise"],
                         ss_type=1,
                         data=df_aov2).round(3)
        assert not aov2.equals(aov2_ss1)

        # Three-way ANOVA using statsmodels
        # Balanced
        df_aov3 = read_dataset('anova3')
        aov3_ss1 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=1,
                         data=df_aov3).round(3)
        aov3_ss2 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=2,
                         data=df_aov3).round(3)
        aov3_ss3 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=3,
                         data=df_aov3).round(3)
        # Check that type 1 == type 2 == type 3
        assert aov3_ss1.equals(aov3_ss2)
        assert aov3_ss2.equals(aov3_ss3)
        # Compare with JASP (last row is the residual, hence the NaN)
        array_equal(aov3_ss1.loc[:, 'F'],
                    [2.462, 13.449, 0.484, 0.139, 1.522, 1.446, 1.094, np.nan])
        array_equal(aov3_ss1.loc[:, 'np2'],
                    [0.049, 0.219, 0.020, 0.003, 0.060, 0.057, 0.044, np.nan])
        array_equal(aov3_ss1.loc[:, 'p-unc'],
                    [0.123, 0.001, 0.619, 0.711, 0.229, 0.245, 0.343, np.nan])
        # Unbalanced
        df_aov3 = read_dataset('anova3_unbalanced')
        aov3_ss1 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=1,
                         data=df_aov3).round(3)
        aov3_ss2 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=2,
                         data=df_aov3).round(3)
        aov3_ss3 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=3,
                         data=df_aov3).round(3)
        # Compare with JASP
        # Type 1
        array_equal(aov3_ss1.loc[:, 'F'],
                    [4.155, 15.166, 0.422, 0.085, 0.859, 1.170, 0.505, np.nan])
        array_equal(aov3_ss1.loc[:, 'np2'],
                    [0.068, 0.210, 0.015, 0.001, 0.029, 0.039, 0.017, np.nan])
        array_equal(aov3_ss1.loc[:, 'p-unc'],
                    [0.046, 0., 0.658, 0.772, 0.429, 0.318, 0.606, np.nan])
        array_equal(aov3_ss1.loc[:, 'Source'], [
            'Sex', 'Risk', 'Drug', 'Sex * Risk', 'Sex * Drug', 'Risk * Drug',
            'Sex * Risk * Drug', 'Residual'
        ])
        # Type 2
        array_equal(aov3_ss2.loc[:, 'F'],
                    [3.759, 15.169, 0.429, 0.099, 0.739, 1.170, 0.505, np.nan])
        array_equal(aov3_ss2.loc[:, 'np2'],
                    [0.062, 0.210, 0.015, 0.002, 0.025, 0.039, 0.017, np.nan])
        array_equal(aov3_ss2.loc[:, 'p-unc'],
                    [0.057, 0., 0.653, 0.754, 0.482, 0.318, 0.606, np.nan])

        # Type 3
        array_equal(aov3_ss3.loc[:, 'F'],
                    [3.910, 15.555, 0.484, 0.079, 0.750, 1.060, 0.505, np.nan])
        array_equal(aov3_ss3.loc[:, 'np2'],
                    [0.064, 0.214, 0.017, 0.001, 0.026, 0.036, 0.017, np.nan])
        array_equal(aov3_ss3.loc[:, 'p-unc'],
                    [0.053, 0., 0.619, 0.779, 0.477, 0.353, 0.606, np.nan])

        # Type 3 again, with standard eta-square as the effect size
        aov3_ss3 = anova(dv="Cholesterol",
                         between=['Sex', 'Risk', 'Drug'],
                         ss_type=3,
                         data=df_aov3,
                         effsize="n2").round(3)
        array_equal(aov3_ss3.loc[:, 'n2'],
                    [0.048, 0.189, 0.012, 0.001, 0.018, 0.026, 0.012, np.nan])

        # Error: invalid char in column names
        df_aov3['Sex:'] = np.random.normal(size=df_aov3.shape[0])
        with pytest.raises(ValueError):
            anova(dv='Cholesterol',
                  between=['Sex:', 'Risk', 'Drug'],
                  data=df_aov3)
Example #4
0
def pairwise_tukey(dv=None,
                   between=None,
                   data=None,
                   alpha=.05,
                   tail='two-sided',
                   effsize='hedges'):
    r"""Pairwise Tukey-HSD post-hoc test.

    Parameters
    ----------
    dv : string
        Name of column containing the dependent variable.
    between: string
        Name of column containing the between factor.
    data : pandas DataFrame
        DataFrame
    alpha : float
        Significance level. Accepted for API compatibility; the current
        implementation does not use it (no critical value is returned).
    tail : string
        Indicates whether to return the 'two-sided' or 'one-sided' p-values.
        With 'one-sided' the Tukey p-values are halved.
    effsize : string or None
        Effect size type. Available methods are ::

        'none' : no effect size
        'cohen' : Unbiased Cohen d
        'hedges' : Hedges g
        'glass': Glass delta
        'eta-square' : Eta-square
        'odds-ratio' : Odds ratio
        'AUC' : Area Under the Curve

    Returns
    -------
    stats : DataFrame
        Stats summary ::

        'A' : Name of first measurement
        'B' : Name of second measurement
        'mean(A)' : Mean of first measurement
        'mean(B)' : Mean of second measurement
        'diff' : Mean difference
        'SE' : Standard error
        'tail' : indicate whether the p-values are one-sided or two-sided
        'T' : T-values
        'p-tukey' : Tukey-HSD corrected p-values
        'efsize' : effect sizes
        'eftype' : type of effect size

    Notes
    -----
    Tukey HSD post-hoc is best for balanced one-way ANOVA.
    It has been proven to be conservative for one-way ANOVA with unequal
    sample sizes. However, it is not robust if the groups have unequal
    variances, in which case the Games-Howell test is more adequate.
    Tukey HSD is not valid for repeated measures ANOVA.

    Note that when the sample sizes are unequal, this function actually
    performs the Tukey-Kramer test (which allows for unequal sample sizes).

    The T-values are defined as:

    .. math::

        t = \dfrac{\overline{x}_i - \overline{x}_j}{\sqrt{2 \cdot MS_w / n}}

    where :math:`\overline{x}_i` and :math:`\overline{x}_j` are the means of
    the first and second group, respectively, :math:`MS_w` the mean squares of
    the error (computed using ANOVA) and :math:`n` the sample size.

    If the sample sizes are unequal, the Tukey-Kramer procedure is
    automatically used:

    .. math::

        t = \dfrac{\overline{x}_i - \overline{x}_j}{\sqrt{\dfrac{MS_w}{n_i}
        + \dfrac{MS_w}{n_j}}}

    where :math:`n_i` and :math:`n_j` are the sample sizes of the first and
    second group, respectively.

    The p-values are then approximated using the Studentized range distribution
    :math:`Q(\sqrt2*|t_i|, r, N - r)` where :math:`r` is the total number of
    groups and :math:`N` is the total sample size.

    Note that the p-values might be slightly different than those obtained
    using R or Matlab since the studentized range approximation is done using
    the Gleason (1999) algorithm, which is more efficient and accurate than
    the algorithms used in Matlab or R.

    References
    ----------
    .. [1] Tukey, John W. "Comparing individual means in the analysis of
           variance." Biometrics (1949): 99-114.

    .. [2] Gleason, John R. "An accurate, non-iterative approximation for
           studentized range quantiles." Computational statistics & data
           analysis 31.2 (1999): 147-158.

    Examples
    --------
    Pairwise Tukey post-hocs on the pain threshold dataset.

        >>> from pingouin import pairwise_tukey
        >>> from pingouin.datasets import read_dataset
        >>> df = read_dataset('anova')
        >>> pairwise_tukey(dv='Pain threshold', between='Hair color', data=df)
    """
    from pingouin.external.qsturng import psturng

    # First compute the ANOVA: row 0 is the between-group effect, row 1 the
    # within-group (error) term.
    aov = anova(dv=dv, data=data, between=between, detailed=True)
    df = aov.loc[1, 'DF']
    ng = aov.loc[0, 'DF'] + 1
    grp = data.groupby(between)[dv]
    n = grp.count().values
    gmeans = grp.mean().values
    # Per-group variance of the mean, MS_w / n_i (Tukey-Kramer form)
    gvar = aov.loc[1, 'MS'] / n

    # Pairwise combinations of group indices
    g1, g2 = np.array(list(combinations(np.arange(ng), 2))).T
    mn = gmeans[g1] - gmeans[g2]
    se = np.sqrt(gvar[g1] + gvar[g2])
    tval = mn / se

    # P-values from the studentized range distribution
    pval = psturng(np.sqrt(2) * np.abs(tval), ng, df)
    pval *= 0.5 if tail == 'one-sided' else 1

    # Effect size: convert the T-values to Cohen d, then to the requested type
    d = tval * np.sqrt(1 / n[g1] + 1 / n[g2])
    ef = convert_effsize(d, 'cohen', effsize, n[g1], n[g2])

    # Group labels, computed once.
    # Careful: pd.unique does NOT sort whereas numpy does
    labels = np.unique(data[between])

    # Create dataframe
    stats = pd.DataFrame({
        'A': labels[g1],
        'B': labels[g2],
        'mean(A)': gmeans[g1],
        'mean(B)': gmeans[g2],
        'diff': mn,
        'SE': np.round(se, 3),
        'tail': tail,
        'T': np.round(tval, 3),
        'p-tukey': pval,
        'efsize': np.round(ef, 3),
        'eftype': effsize,
    })
    return stats
Example #5
0
 def test_anova(self):
     """Check ``anova`` against reference values.

     Compare results to JASP. Covers the one-way pain dataset (complete and
     with missing values) and two-way designs (balanced, unbalanced, and
     unbalanced with a missing value).
     """
     # One-way ANOVA on the pain dataset, also exporting the table to CSV
     pain = read_dataset('anova')
     oneway = anova(data=pain,
                    dv='Pain threshold',
                    between='Hair color',
                    detailed=True,
                    export_filename='test_export.csv')
     # Same factor, passed as a single-element list
     anova(data=pain, dv='Pain threshold', between=['Hair color'])
     # Reference values from JASP
     assert np.allclose(oneway.loc[0, 'F'], 6.791)
     assert np.allclose(np.round(oneway.loc[0, 'p-unc'], 3), .004)
     assert np.allclose(oneway.loc[0, 'np2'], .576)

     # Unbalanced and with missing values
     pain.loc[[17, 18], 'Pain threshold'] = np.nan
     oneway = pain.anova(dv='Pain threshold', between='Hair color').round(3)
     first_row = (
         ('ddof1', 3),
         ('ddof2', 13),
         ('F', 4.359),
         ('p-unc', 0.025),
         ('np2', 0.501),
     )
     for column, value in first_row:
         assert oneway.loc[0, column] == value

     # Two-way ANOVA with balanced design
     balanced = read_dataset('anova2')
     twoway = anova(data=balanced, dv="Yield",
                    between=["Blend", "Crop"]).round(3)
     reference = (
         ('MS', (2.042, 1368.292, 1180.042, 541.847)),
         ('F', (0.004, 2.525, 2.178)),
         ('p-unc', (0.952, 0.108, 0.142)),
         ('np2', (0.000, 0.219, 0.195)),
     )
     for column, values in reference:
         for row, value in enumerate(values):
             assert twoway.loc[row, column] == value

     # Two-way ANOVA with unbalanced design (via the DataFrame method)
     unbalanced = read_dataset('anova2_unbalanced')
     twoway = unbalanced.anova(dv="Scores",
                               between=["Diet", "Exercise"],
                               export_filename='test_export.csv').round(3)
     reference = (
         ('MS', (390.625, 180.625, 15.625, 52.625)),
         ('F', (7.423, 3.432, 0.297)),
         ('p-unc', (0.034, 0.113, 0.605)),
         ('np2', (0.553, 0.364, 0.047)),
     )
     for column, values in reference:
         for row, value in enumerate(values):
             assert twoway.loc[row, column] == value

     # Two-way ANOVA with unbalanced design and missing values
     unbalanced.loc[9, 'Scores'] = np.nan
     twoway = anova(data=unbalanced, dv="Scores",
                    between=["Diet", "Exercise"]).round(3)
     reference = (
         ('F', (10.403, 5.167, 0.761)),
         ('p-unc', (0.023, 0.072, 0.423)),
         ('np2', (0.675, 0.508, 0.132)),
     )
     for column, values in reference:
         for row, value in enumerate(values):
             assert twoway.loc[row, column] == value