Python MI Examples, statsmodels.imputation.bayes_mi.MI Python Examples

Example #1

0

Show file

def test_MI():

    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan

    def model_args_fn(x):
        # Return endog, exog
        # Regress x0 on x1 and x2
        if type(x) is np.ndarray:
            return (x[:, 0], x[:, 1:])
        else:
            return (x.iloc[:, 0].values, x.iloc[:, 1:].values)

    for j in (0, 1):
        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args_fn, burn=0)
        r = mi.fit()
        r.summary()  # smoke test
        # TODO: why does the test tolerance need to be so slack?
        # There is unexpected variation across versions on travis.
        assert_allclose(r.params, np.r_[-0.05347919, -0.02479701, 0.10075517],
                        0.25, 0)

        c = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                        [0.00029746, 0.00407264, 0.00019496],
                        [-0.00035057, 0.00019496, 0.00509413]])
        assert_allclose(r.cov_params(), c, 0.3, 0)

        # Test with ndarray and pandas input
        x = pd.DataFrame(x)

Example #2

0

Show file

def test_mi_formula():

    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan
    df = pd.DataFrame({
        "y": x[:, 0],
        "x1": x[:, 1],
        "x2": x[:, 2],
        "x3": x[:, 3]
    })
    fml = "y ~ 0 + x1 + x2 + x3"

    np.random.seed(2342)
    imp = BayesGaussMI(df.copy())
    mi = MI(imp, sm.OLS, formula=fml, burn=0)
    r = mi.fit()
    r.summary()  # smoke test
    # TODO: why does the test tolerance need to be so slack?
    # There is unexpected variation across versions on travis.
    assert_allclose(r.params, np.r_[-0.05347919, -0.02479701, 0.10075517],
                    0.25, 0)

    c = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                    [0.00029746, 0.00407264, 0.00019496],
                    [-0.00035057, 0.00019496, 0.00509413]])
    assert_allclose(r.cov_params(), c, 0.3, 0)

Example #3

0

Show file

File: test_bayes_mi.py Project: haribharadwaj/statsmodels

def test_MI():

    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan

    def model_args_fn(x):
        # Return endog, exog
        # Regress x0 on x1 and x2
        if type(x) is np.ndarray:
            return (x[:, 0], x[:, 1:])
        else:
            return (x.iloc[:, 0].values, x.iloc[:, 1:].values)

    for j in (0, 1):
        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args_fn, burn=0)
        r = mi.fit()
        r.summary()  # smoke test
        # TODO: why does the test tolerance need to be so slack?
        # There is unexpected variation across versions on travis.
        assert_allclose(r.params, np.r_[
            -0.05347919, -0.02479701, 0.10075517], 0.25, 0)

        c = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                        [0.00029746, 0.00407264, 0.00019496],
                        [-0.00035057, 0.00019496, 0.00509413]])
        assert_allclose(r.cov_params(), c, 0.3, 0)

        # Test with ndarray and pandas input
        x = pd.DataFrame(x)

Example #4

0

Show file

File: test_bayes_mi.py Project: haribharadwaj/statsmodels

def test_mi_formula():

    np.random.seed(414)
    x = np.random.normal(size=(200, 4))
    x[[1, 3, 9], 0] = np.nan
    x[[1, 4, 3], 1] = np.nan
    x[[2, 11, 21], 2] = np.nan
    x[[11, 22, 99], 3] = np.nan
    df = pd.DataFrame({"y": x[:, 0], "x1": x[:, 1],
                       "x2": x[:, 2], "x3": x[:, 3]})
    fml = "y ~ 0 + x1 + x2 + x3"

    np.random.seed(2342)
    imp = BayesGaussMI(df.copy())
    mi = MI(imp, sm.OLS, formula=fml, burn=0)
    r = mi.fit()
    r.summary()  # smoke test
    # TODO: why does the test tolerance need to be so slack?
    # There is unexpected variation across versions on travis.
    assert_allclose(r.params, np.r_[
            -0.05347919, -0.02479701, 0.10075517], 0.25, 0)

    c = np.asarray([[0.00418232, 0.00029746, -0.00035057],
                    [0.00029746, 0.00407264, 0.00019496],
                    [-0.00035057, 0.00019496, 0.00509413]])
    assert_allclose(r.cov_params(), c, 0.3, 0)

Example #5

0

Show file

 def make_imp(fml):
     imp = MI(
         mimi(impvar, vx, vb, mn, proj, varls, df, log, bp_var, bp_dir),
         sm.OLS,
         formula=fml,
         model_kwds_fn=lambda x: {"data": x},
         burn=0,
         nrep=n_imp,
         skip=0)
     return imp

Example #6

0

Show file

def test_MI_stat():
    # Test for MI where we know statistically what should happen. The
    # analysis model is x0 ~ x1 with standard error 1/sqrt(n) for the
    # slope parameter.  The nominal n is 1000, but half of the cases
    # have missing x1.  Then we introduce x2 that is either
    # independent of x1, or almost perfectly correlated with x1.  In
    # the first case the SE is 1/sqrt(500), in the second case the SE
    # is 1/sqrt(1000).

    np.random.seed(414)
    z = np.random.normal(size=(1000, 3))
    z[:, 0] += 0.5 * z[:, 1]

    # Control the degree to which x2 proxies for x1
    exp = [1 / np.sqrt(500), 1 / np.sqrt(1000)]
    fmi = [0.5, 0]
    for j, r in enumerate((0, 0.9999)):

        x = z.copy()
        x[:, 2] = r * x[:, 1] + np.sqrt(1 - r**2) * x[:, 2]
        x[0:500, 1] = np.nan

        def model_args(x):
            # Return endog, exog
            # Regress x1 on x2
            return (x[:, 0], x[:, 1])

        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args, nrep=100, skip=10)
        r = mi.fit()

        # Check the SE
        d = np.abs(r.bse[0] - exp[j]) / exp[j]
        assert (d < 0.03)

        # Check the FMI
        d = np.abs(r.fmi[0] - fmi[j])
        assert (d < 0.05)

Example #7

0

Show file

File: test_bayes_mi.py Project: haribharadwaj/statsmodels

def test_MI_stat():
    # Test for MI where we know statistically what should happen. The
    # analysis model is x0 ~ x1 with standard error 1/sqrt(n) for the
    # slope parameter.  The nominal n is 1000, but half of the cases
    # have missing x1.  Then we introduce x2 that is either
    # independent of x1, or almost perfectly correlated with x1.  In
    # the first case the SE is 1/sqrt(500), in the second case the SE
    # is 1/sqrt(1000).

    np.random.seed(414)
    z = np.random.normal(size=(1000, 3))
    z[:, 0] += 0.5*z[:, 1]

    # Control the degree to which x2 proxies for x1
    exp = [1/np.sqrt(500), 1/np.sqrt(1000)]
    fmi = [0.5, 0]
    for j, r in enumerate((0, 0.9999)):

        x = z.copy()
        x[:, 2] = r*x[:, 1] + np.sqrt(1 - r**2)*x[:, 2]
        x[0:500, 1] = np.nan

        def model_args(x):
            # Return endog, exog
            # Regress x1 on x2
            return (x[:, 0], x[:, 1])

        np.random.seed(2342)
        imp = BayesGaussMI(x.copy())
        mi = MI(imp, sm.OLS, model_args, nrep=100, skip=10)
        r = mi.fit()

        # Check the SE
        d = np.abs(r.bse[0] - exp[j]) / exp[j]
        assert(d < 0.03)

        # Check the FMI
        d = np.abs(r.fmi[0] - fmi[j])
        assert(d < 0.05)

Example #8

0

Show file

        "groups": "MomIdUnique",
        "data": x,
        "re_formula": "1",
        "vc_formula": vcf
    }


def fit_kwds_fn(x):
    return {"method": "lbfgs", "reml": False}


imp = MI(mimi(impvar, vx, vb, mn, proj, varls, df, log, bp_var, bp_dir),
         sm.MixedLM,
         model_args_fn=None,
         formula=fml,
         model_kwds_fn=model_kwds_fn,
         fit_kwds=fit_kwds_fn,
         burn=0,
         nrep=20,
         skip=0)

if (impvar == "BMI") and (ndim == 1) and controlcbs:

    import json
    f = open("centering.json")
    centering = json.load(f)
    f.close()

    # Create a table 1, based on counting people
    m = mimi(impvar, vx, vb, mn, proj, varls, df, log, bp_var, bp_dir)
    m.update()