Example #1
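These examples are excerpts from a pytest test class, so names such as df, df_nan, data, and data_dum come from the surrounding test module. Below is a minimal sketch of the imports and fixtures the snippets rely on; the fixture construction is illustrative only (the real suite builds them from a bundled dataset, so the hard-coded numbers asserted in the tests will not reproduce on this synthetic data):

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_equal
from pingouin import logistic_regression

# Illustrative fixtures: a predictor X, a second predictor M, a binary
# outcome Ybin, two constant columns, and a copy with missing values.
rng = np.random.default_rng(123)
df = pd.DataFrame({'X': rng.normal(size=100), 'M': rng.normal(size=100)})
df['Ybin'] = (df['X'] + rng.normal(size=100) > 0).astype(int)
df['One'] = 1   # single unique value
df['Zero'] = 0  # single unique value
df_nan = df.copy()
df_nan.loc[rng.choice(100, size=5, replace=False), 'X'] = np.nan
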
    def test_logistic_regression(self):
        """Test function logistic_regression."""

        # Simple regression
        lom = logistic_regression(df['X'], df['Ybin'], as_dataframe=False)
        # Compare to JASP
        assert_equal(np.round(lom['coef'], 1), [1.3, -0.2])
        assert_equal(np.round(lom['se'], 2), [0.76, 0.12])
        assert_equal(np.round(lom['z'], 1), [1.7, -1.6])
        assert_equal(np.round(lom['pval'], 1), [0.1, 0.1])
        assert_equal(np.round(lom['CI[2.5%]'], 1), [-.2, -.4])
        assert_equal(np.round(lom['CI[97.5%]'], 1), [2.8, 0.0])

        # Multiple predictors
        X = df[['X', 'M']].values
        y = df['Ybin'].values
        lom = logistic_regression(X, y)  # Pingouin
        # Compare against JASP
        assert_equal(np.round(lom['coef'].values, 1), [1.3, -0.2, -0.0])
        assert_equal(np.round(lom['se'].values, 2), [0.78, 0.14, 0.13])
        assert_equal(np.round(lom['z'].values, 1), [1.7, -1.4, -0.1])
        assert_equal(np.round(lom['pval'].values, 1), [0.1, 0.2, 1.])
        assert_equal(np.round(lom['CI[2.5%]'].values, 1), [-.2, -.5, -.3])
        assert_equal(np.round(lom['CI[97.5%]'].values, 1), [2.8, 0.1, 0.2])

        # Test other arguments
        c = logistic_regression(df[['X', 'M']], df['Ybin'], coef_only=True)
        assert_equal(np.round(c, 1), [1.3, -0.2, -0.0])

        # Test **kwargs
        logistic_regression(X, y, solver='sag', C=10, max_iter=10000)

        # Error: dependent variable is not binary
        with pytest.raises(ValueError):
            y[3] = 2
            logistic_regression(X, y)
Example #2
    def test_logistic_regression(self):
        """Test function logistic_regression."""

        # Simple regression
        lom = logistic_regression(df['X'], df['Ybin'], as_dataframe=False)
        # Compare to R
        # Reproduce in a Jupyter notebook with rpy2: run
        # %load_ext rpy2.ipython in a separate cell, then the
        # two lines below together in one R cell:
        # %%R -i df
        # summary(glm(Ybin ~ X, data=df, family=binomial))
        assert_equal(np.round(lom['coef'], 4), [1.3191, -0.1995])
        assert_equal(np.round(lom['se'], 4), [0.7582, 0.1211])
        assert_equal(np.round(lom['z'], 4), [1.7399, -1.6476])
        assert_equal(np.round(lom['pval'], 4), [0.0819, 0.0994])
        assert_equal(np.round(lom['CI[2.5%]'], 4), [-.1669, -.4367])
        assert_equal(np.round(lom['CI[97.5%]'], 4), [2.8050, 0.0378])

        # Multiple predictors
        X = df[['X', 'M']].values
        y = df['Ybin'].values
        lom = logistic_regression(X, y).round(4)  # Pingouin
        # Compare against R
        # summary(glm(Ybin ~ X+M, data=df, family=binomial))
        assert_equal(lom['coef'].values, [1.3276, -0.1960, -0.0060])
        assert_equal(lom['se'].values, [0.7784, 0.1408, 0.1253])
        assert_equal(lom['z'].values, [1.7056, -1.3926, -0.0476])
        assert_equal(lom['pval'].values, [0.0881, 0.1637, 0.9620])
        assert_equal(lom['CI[2.5%]'].values, [-.1980, -.4719, -.2516])
        assert_equal(lom['CI[97.5%]'].values, [2.8531, 0.0799, 0.2397])

        # Test other arguments
        c = logistic_regression(df[['X', 'M']], df['Ybin'], coef_only=True)
        assert_equal(np.round(c, 4), [1.3276, -0.1960, -0.0060])

        # With missing values
        logistic_regression(df_nan[['X', 'M']], df_nan['Ybin'], remove_na=True)

        # Test **kwargs
        logistic_regression(X,
                            y,
                            solver='sag',
                            C=10,
                            max_iter=10000,
                            penalty="l2")
        logistic_regression(X, y, solver='sag', multi_class='auto')

        # Test regularization coefficients are strictly closer to 0 than
        # unregularized
        c = logistic_regression(df['X'], df['Ybin'], coef_only=True)
        c_reg = logistic_regression(df['X'],
                                    df['Ybin'],
                                    coef_only=True,
                                    penalty='l2')
        assert all(np.abs(c) > np.abs(c_reg))
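
The property asserted here (an L2 penalty pulls every coefficient strictly toward zero) can be checked against scikit-learn directly. A minimal sketch on toy data, assuming a scikit-learn recent enough to accept penalty=None (older versions spell it penalty='none'):

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(7)
Xs = rng.normal(size=(200, 1))
ys = (Xs[:, 0] + rng.normal(size=200) > 0).astype(int)

unpen = LogisticRegression(penalty=None).fit(Xs, ys)       # no regularization
pen = LogisticRegression(penalty='l2', C=1.0).fit(Xs, ys)  # ridge-style penalty
assert abs(pen.coef_[0, 0]) < abs(unpen.coef_[0, 0])       # shrunk toward 0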

        # With one column that has only one unique value
        c = logistic_regression(df[['One', 'X']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X'])
        c = logistic_regression(df[['X', 'One', 'M', 'Zero']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X', 'M'])

        # With duplicate columns
        c = logistic_regression(df[['X', 'M', 'X']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X', 'M'])
        c = logistic_regression(df[['X', 'X', 'X']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X'])

        # Error: dependent variable is not binary
        with pytest.raises(ValueError):
            y[3] = 2
            logistic_regression(X, y)
Example #3
    def test_logistic_regression(self):
        """Test function logistic_regression."""
        # Simple regression
        lom = logistic_regression(df['X'], df['Ybin'], as_dataframe=False)
        # Compare to R
        # Reproduce in a Jupyter notebook with rpy2: run
        # %load_ext rpy2.ipython in a separate cell, then the
        # two lines below together in one R cell:
        # %%R -i df
        # summary(glm(Ybin ~ X, data=df, family=binomial))
        assert_equal(np.round(lom['coef'], 4), [1.3191, -0.1995])
        assert_equal(np.round(lom['se'], 4), [0.7582, 0.1211])
        assert_equal(np.round(lom['z'], 4), [1.7399, -1.6476])
        assert_equal(np.round(lom['pval'], 4), [0.0819, 0.0994])
        assert_equal(np.round(lom['CI[2.5%]'], 4), [-.1669, -.4367])
        assert_equal(np.round(lom['CI[97.5%]'], 4), [2.8050, 0.0378])

        # Multiple predictors
        X = df[['X', 'M']].to_numpy()
        y = df['Ybin'].to_numpy()
        lom = logistic_regression(X, y).round(4)  # Pingouin
        # Compare against R
        # summary(glm(Ybin ~ X+M, data=df, family=binomial))
        assert_equal(lom['coef'].to_numpy(), [1.3275, -0.1960, -0.0060])
        assert_equal(lom['se'].to_numpy(), [0.7784, 0.1408, 0.1253])
        assert_equal(lom['z'].to_numpy(), [1.7055, -1.3926, -0.0475])
        assert_equal(lom['pval'].to_numpy(), [0.0881, 0.1637, 0.9621])
        assert_equal(lom['CI[2.5%]'].to_numpy(), [-.1981, -.4719, -.2516])
        assert_equal(lom['CI[97.5%]'].to_numpy(), [2.8531, 0.0799, 0.2397])

        # Test other arguments
        c = logistic_regression(df[['X', 'M']], df['Ybin'], coef_only=True)
        assert_equal(np.round(c, 4), [1.3275, -0.1960, -0.0060])

        # With missing values
        logistic_regression(df_nan[['X', 'M']], df_nan['Ybin'], remove_na=True)

        # Test **kwargs
        logistic_regression(X, y, solver='sag', C=10, max_iter=10000,
                            penalty="l2")
        logistic_regression(X, y, solver='sag', multi_class='auto')

        # Test regularization coefficients are strictly closer to 0 than
        # unregularized
        c = logistic_regression(df['X'], df['Ybin'], coef_only=True)
        c_reg = logistic_regression(df['X'], df['Ybin'], coef_only=True,
                                    penalty='l2')
        assert all(np.abs(c) > np.abs(c_reg))

        # With one column that has only one unique value
        c = logistic_regression(df[['One', 'X']], df['Ybin'])
        assert_equal(c.loc[:, 'names'].to_numpy(), ['Intercept', 'X'])
        c = logistic_regression(df[['X', 'One', 'M', 'Zero']], df['Ybin'])
        assert_equal(c.loc[:, 'names'].to_numpy(), ['Intercept', 'X', 'M'])

        # With duplicate columns
        c = logistic_regression(df[['X', 'M', 'X']], df['Ybin'])
        assert_equal(c.loc[:, 'names'].to_numpy(), ['Intercept', 'X', 'M'])
        c = logistic_regression(df[['X', 'X', 'X']], df['Ybin'])
        assert_equal(c.loc[:, 'names'].to_numpy(), ['Intercept', 'X'])

        # Error: dependent variable is not binary
        with pytest.raises(ValueError):
            y[3] = 2
            logistic_regression(X, y)

        # --------------------------------------------------------------------
        # 2nd dataset (penguins) -- compare to R
        lom = logistic_regression(data['body_mass_g'], data['male'],
                                  as_dataframe=False)
        assert np.allclose(lom['coef'], [-5.162541644, 0.001239819])
        assert_equal(np.round(lom['se'], 5), [0.72439, 0.00017])
        assert_equal(np.round(lom['z'], 3), [-7.127, 7.177])
        assert np.allclose(lom['pval'], [1.03e-12, 7.10e-13])
        assert_equal(np.round(lom['CI[2.5%]'], 3), [-6.582, 0.001])
        assert_equal(np.round(lom['CI[97.5%]'], 3), [-3.743, 0.002])
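
For interpretation, the fitted model above is P(male) = sigmoid(-5.1625 + 0.00124 * body_mass_g). A quick sketch using the coefficients from the assertions (the 4000 g query point is arbitrary):

import numpy as np

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

b0, b1 = -5.162541644, 0.001239819  # intercept and slope asserted above
print(sigmoid(b0 + b1 * 4000))      # P(male) at 4000 g, approx. 0.45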

        # With a different scaling (grams -> kg): the slope and its SE are
        # multiplied by 1000, but the z and p-values are unchanged
        lom = logistic_regression(data['body_mass_kg'], data['male'],
                                  as_dataframe=False)
        assert np.allclose(lom['coef'], [-5.162542, 1.239819])
        assert_equal(np.round(lom['se'], 4), [0.7244, 0.1727])
        assert_equal(np.round(lom['z'], 3), [-7.127, 7.177])
        assert np.allclose(lom['pval'], [1.03e-12, 7.10e-13])
        assert_equal(np.round(lom['CI[2.5%]'], 3), [-6.582, 0.901])
        assert_equal(np.round(lom['CI[97.5%]'], 3), [-3.743, 1.578])

        # With no intercept
        lom = logistic_regression(data['body_mass_kg'], data['male'],
                                  as_dataframe=False, fit_intercept=False)
        assert np.isclose(lom['coef'], 0.04150582)
        assert np.round(lom['se'], 5) == 0.02570
        assert np.round(lom['z'], 3) == 1.615
        assert np.round(lom['pval'], 3) == 0.106
        assert np.round(lom['CI[2.5%]'], 3) == -0.009
        assert np.round(lom['CI[97.5%]'], 3) == 0.092

        # With categorical predictors
        # R: >>> glm("male ~ body_mass_kg + species", family=binomial, ...)
        #    >>> confint.default(model)  # Wald CI
        # See https://stats.stackexchange.com/a/275421/253579
        data_dum = pd.get_dummies(data, columns=['species'], drop_first=True)
        X = data_dum[['body_mass_kg', 'species_Chinstrap', 'species_Gentoo']]
        y = data_dum['male']
        lom = logistic_regression(X, y, as_dataframe=False)
        assert_equal(np.round(lom['coef'], 7),
                     [-27.1318593, 7.3728436, -0.2559251, -10.1778083])
        assert_equal(np.round(lom['se'], 4),
                     [2.9984, 0.8141, 0.4293, 1.1946])
        assert_equal(np.round(lom['z'], 3),
                     [-9.049, 9.056, -0.596, -8.520])
        assert_equal(np.round(lom['CI[2.5%]'], 3),
                     [-33.009, 5.777, -1.097, -12.519])
        assert_equal(np.round(lom['CI[97.5%]'], 3),
                     [-21.255, 8.969, 0.586, -7.836])
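
The confidence intervals asserted throughout are Wald CIs, i.e. coef ± z(0.975) * se, which is why the R comment above uses confint.default rather than the profile-likelihood confint. A quick numeric check with the values from the assertions (last-digit differences can appear because the se values are themselves rounded):

import numpy as np
from scipy.stats import norm

coef = np.array([-27.1318593, 7.3728436, -0.2559251, -10.1778083])
se = np.array([2.9984, 0.8141, 0.4293, 1.1946])
zcrit = norm.ppf(0.975)                # approx. 1.96
print(np.round(coef - zcrit * se, 3))  # approx. [-33.009, 5.777, -1.097, -12.519]
print(np.round(coef + zcrit * se, 3))  # approx. [-21.255, 8.969, 0.586, -7.836]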
Example #4
    def test_logistic_regression(self):
        """Test function logistic_regression."""

        # Simple regression
        lom = logistic_regression(df['X'], df['Ybin'], as_dataframe=False)
        # Compare to JASP
        assert_equal(np.round(lom['coef'], 1), [1.3, -0.2])
        assert_equal(np.round(lom['se'], 2), [0.76, 0.12])
        assert_equal(np.round(lom['z'], 1), [1.7, -1.6])
        assert_equal(np.round(lom['pval'], 1), [0.1, 0.1])
        assert_equal(np.round(lom['CI[2.5%]'], 1), [-.2, -.4])
        assert_equal(np.round(lom['CI[97.5%]'], 1), [2.8, 0.0])

        # Multiple predictors
        X = df[['X', 'M']].values
        y = df['Ybin'].values
        lom = logistic_regression(X, y)  # Pingouin
        # Compare against JASP
        assert_equal(np.round(lom['coef'].values, 1), [1.3, -0.2, -0.0])
        assert_equal(np.round(lom['se'].values, 2), [0.78, 0.14, 0.13])
        assert_equal(np.round(lom['z'].values, 1), [1.7, -1.4, -0.1])
        assert_equal(np.round(lom['pval'].values, 1), [0.1, 0.2, 1.])
        assert_equal(np.round(lom['CI[2.5%]'].values, 1), [-.2, -.5, -.3])
        assert_equal(np.round(lom['CI[97.5%]'].values, 1), [2.8, 0.1, 0.2])

        # Test other arguments
        c = logistic_regression(df[['X', 'M']], df['Ybin'], coef_only=True)
        assert_equal(np.round(c, 1), [1.3, -0.2, -0.0])

        # With missing values
        logistic_regression(df_nan[['X', 'M']], df_nan['Ybin'], remove_na=True)

        # Test **kwargs
        logistic_regression(X, y, solver='sag', C=10, max_iter=10000)
        logistic_regression(X, y, solver='sag', multi_class='auto')

        # With one column that has only one unique value
        c = logistic_regression(df[['One', 'X']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X'])
        c = logistic_regression(df[['X', 'One', 'M', 'Zero']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X', 'M'])

        # With duplicate columns
        c = logistic_regression(df[['X', 'M', 'X']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X', 'M'])
        c = logistic_regression(df[['X', 'X', 'X']], df['Ybin'])
        assert np.array_equal(c.loc[:, 'names'], ['Intercept', 'X'])

        # Error: dependent variable is not binary
        with pytest.raises(ValueError):
            y[3] = 2
            logistic_regression(X, y)
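
Outside a test class, a minimal standalone call looks like this (synthetic data; column names are illustrative):

import numpy as np
import pandas as pd
from pingouin import logistic_regression

rng = np.random.default_rng(0)
demo = pd.DataFrame({'X': rng.normal(size=80), 'M': rng.normal(size=80)})
demo['Ybin'] = (demo['X'] + rng.normal(size=80) > 0).astype(int)
print(logistic_regression(demo[['X', 'M']], demo['Ybin']).round(3))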