Ejemplo n.º 1
0
    def test_compare_OLS(self):
        """
        Fit the same linear formula with a Gaussian GEE (independence
        working correlation) and with OLS, then require the coefficients,
        the naive standard errors and the naive t-values to match to 10
        decimal places.
        """

        n_obs = 100
        formula = "Y ~ X1 + X2 + X3"

        working_corr = Independence()
        gaussian = Gaussian()

        # One normal draw per column, response first.
        frame = pd.DataFrame({
            name: np.random.normal(size=n_obs)
            for name in ("Y", "X1", "X2", "X3")
        })

        # 20 group labels, each repeated for 5 consecutive rows.
        cluster_ids = np.kron(lrange(20), np.ones(5))

        gee_fit = GEE.from_formula(formula,
                                   cluster_ids,
                                   frame,
                                   family=gaussian,
                                   cov_struct=working_corr).fit()
        ols_fit = sm.ols(formula, data=frame).fit()

        # Coefficients.
        assert_almost_equal(ols_fit.params.values, gee_fit.params, decimal=10)

        # Standard errors from the naive covariance estimate.
        naive_se = gee_fit.standard_errors(covariance_type="naive")
        assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

        # t-values rebuilt by hand from the naive covariance matrix.
        naive_sd = np.sqrt(np.diag(gee_fit.naive_covariance))
        assert_almost_equal(gee_fit.params / naive_sd,
                            ols_fit.tvalues,
                            decimal=10)
Ejemplo n.º 2
0
    def test_default_time(self):
        """
        Fit the same autoregressive GEE twice, once without a ``time``
        argument and once with explicit within-group positions
        0, 1, 2, ..., and require matching estimates and standard errors
        (6 decimal places).
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Explicit time index: the position of each row inside its group.
        positions = np.zeros(len(endog))
        for label in set(group):
            members = np.flatnonzero(group == label)
            positions[members] = lrange(len(members))

        binom = Binomial()
        ar_struct = Autoregressive()

        fit_default = GEE(endog, exog, group,
                          family=binom, cov_struct=ar_struct).fit()
        fit_explicit = GEE(endog, exog, group, time=positions,
                           family=binom, cov_struct=ar_struct).fit()

        assert_almost_equal(fit_default.params,
                            fit_explicit.params,
                            decimal=6)
        assert_almost_equal(fit_default.standard_errors(),
                            fit_explicit.standard_errors(),
                            decimal=6)
Ejemplo n.º 3
0
    def test_compare_OLS(self):
        # With a Gaussian family and an independence working correlation,
        # GEE is expected to reproduce OLS.  The coefficients, the naive
        # standard errors and the naive t-values are compared to 10
        # decimal places.

        independence = Independence()
        gaussian = Gaussian()

        n_obs = 100
        Y, X1, X2, X3 = [np.random.normal(size=n_obs) for _ in range(4)]
        cluster_ids = np.kron(lrange(20), np.ones(5))

        frame = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})
        formula = "Y ~ X1 + X2 + X3"

        gee_model = GEE.from_formula(formula, cluster_ids, frame,
                                     family=gaussian,
                                     cov_struct=independence)
        gee_fit = gee_model.fit()

        # Work with the unwrapped OLS results: the assert helpers do not
        # work on the wrapper.
        ols_fit = smf.ols(formula, data=frame).fit()._results

        assert_almost_equal(ols_fit.params, gee_fit.params, decimal=10)

        naive_se = gee_fit.standard_errors(cov_type="naive")
        assert_almost_equal(ols_fit.bse, naive_se, decimal=10)

        # t-values rebuilt by hand from the naive covariance matrix.
        naive_sd = np.sqrt(np.diag(gee_fit.cov_naive))
        assert_almost_equal(gee_fit.params / naive_sd, ols_fit.tvalues,
                            decimal=10)
Ejemplo n.º 4
0
    def test_missing_formula(self):
        # Formula interface: a fit with missing='drop' on data that
        # contains NaNs is compared with a fit on the same data after the
        # incomplete rows have been removed beforehand.

        n_obs = 100
        endog = np.random.normal(size=n_obs)
        exog1 = np.random.normal(size=n_obs)
        exog2 = np.random.normal(size=n_obs)
        exog3 = np.random.normal(size=n_obs)
        groups = np.kron(lrange(20), np.ones(5))

        # Five incomplete rows: 0, 5, 6 (response) and 10, 11 (exog2).
        endog[[0, 5, 6]] = np.nan
        exog2[10:12] = np.nan

        formula = "endog ~ exog1 + exog2 + exog3"
        data = pd.DataFrame({"endog": endog, "exog1": exog1, "exog2": exog2,
                             "exog3": exog3, "groups": groups})

        mod_drop = GEE.from_formula(formula, groups, data, missing='drop')
        rslt_drop = mod_drop.fit()

        # 95 rows remain; the design matrix has 4 columns.
        assert_almost_equal(len(mod_drop.endog), 95)
        assert_almost_equal(np.asarray(mod_drop.exog.shape), np.r_[95, 4])

        # Reference fit on the rows that have no missing value.
        complete = data.dropna()
        complete_groups = groups[complete.index.values]
        rslt_clean = GEE.from_formula(formula, complete_groups, complete,
                                      missing='none').fit()

        assert_almost_equal(rslt_drop.params.values, rslt_clean.params.values)
        assert_almost_equal(rslt_drop.bse.values, rslt_clean.bse.values)
Ejemplo n.º 5
0
    def test_default_time(self):
        # An autoregressive GEE fitted without a `time` argument is
        # compared with one that is given explicit within-group positions
        # 0, 1, 2, ...; estimates and standard errors must agree to 6
        # decimal places.

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Explicit time values for the autoregressive model.
        explicit_time = np.zeros(len(endog))
        for label in set(group):
            rows = np.flatnonzero(group == label)
            explicit_time[rows] = lrange(len(rows))

        binomial = Binomial()
        autoreg = Autoregressive()

        # First fit relies on the default time, second one passes it.
        fits = []
        for extra in ({}, {"time": explicit_time}):
            model = GEE(endog, exog, group, family=binomial,
                        cov_struct=autoreg, **extra)
            fits.append(model.fit())
        default_fit, explicit_fit = fits

        assert_almost_equal(default_fit.params, explicit_fit.params,
                            decimal=6)
        assert_almost_equal(default_fit.standard_errors(),
                            explicit_fit.standard_errors(), decimal=6)
Ejemplo n.º 6
0
    def t_est_missing(self):
        """
        Fit a formula-based GEE with ``missing='drop'`` on data holding
        NaNs in five rows and check that 95 observations and a
        four-column design matrix remain.

        NOTE(review): the ``t_est_`` prefix presumably keeps test runners
        from collecting this method -- confirm that it is disabled on
        purpose.
        """

        n_obs = 100
        Y = np.random.normal(size=n_obs)
        X1 = np.random.normal(size=n_obs)
        X2 = np.random.normal(size=n_obs)
        X3 = np.random.normal(size=n_obs)
        groups = np.kron(lrange(20), np.ones(5))

        # Rows 0, 5, 6 lose the response; rows 10, 11 lose X2.
        Y[[0, 5, 6]] = np.nan
        X2[10:12] = np.nan

        columns = {"Y": Y, "X1": X1, "X2": X2, "X3": X3, "groups": groups}
        D = pd.DataFrame(columns)

        md = GEE.from_formula("Y ~ X1 + X2 + X3",
                              D,
                              None,
                              groups=D["groups"],
                              missing='drop')
        # The fit result is not inspected; only the model data is checked.
        md.fit()

        assert len(md.endog) == 95
        assert md.exog.shape == (95, 4)
Ejemplo n.º 7
0
    def test_missing(self):
        # Array interface: a fit with missing='drop' on data containing
        # NaNs is compared with a fit on the same data after the
        # incomplete rows have been filtered out by hand.
        # NOTE(review): the earlier comment here said that missing data
        # handling does not currently work for formulas -- confirm
        # whether that still holds.

        n_obs = 100
        endog = np.random.normal(size=n_obs)
        exog = np.random.normal(size=(n_obs, 3))
        exog[:, 0] = 1  # constant first column
        groups = np.kron(lrange(20), np.ones(5))

        # Rows 0, 5, 6 lose the response; rows 10, 11 lose a covariate.
        endog[[0, 5, 6]] = np.nan
        exog[10:12, 1] = np.nan

        mod_drop = GEE(endog, exog, groups, missing='drop')
        rslt_drop = mod_drop.fit()

        # 95 rows remain; the design matrix keeps its 3 columns.
        assert_almost_equal(len(mod_drop.endog), 95)
        assert_almost_equal(np.asarray(mod_drop.exog.shape), np.r_[95, 3])

        # Reference fit on the rows that are finite everywhere.
        complete = np.isfinite(endog) & np.isfinite(exog).all(1)
        rslt_clean = GEE(endog[complete], exog[complete, :],
                         groups[complete], missing='none').fit()

        assert_almost_equal(rslt_drop.params, rslt_clean.params)
        assert_almost_equal(rslt_drop.bse, rslt_clean.bse)
Ejemplo n.º 8
0
    def test_missing_formula(self):
        # Missing data through the formula interface: missing='drop'
        # must give the same parameters and standard errors as a fit on
        # data from which the NaN rows were dropped in advance.

        size = 100
        endog = np.random.normal(size=size)
        exog1 = np.random.normal(size=size)
        exog2 = np.random.normal(size=size)
        exog3 = np.random.normal(size=size)
        groups = np.kron(lrange(20), np.ones(5))

        # Introduce NaNs into five distinct rows.
        endog[0] = np.nan
        endog[5:7] = np.nan
        exog2[10:12] = np.nan

        data = pd.DataFrame({"endog": endog, "exog1": exog1, "exog2": exog2,
                             "exog3": exog3, "groups": groups})
        formula = "endog ~ exog1 + exog2 + exog3"

        dropping_model = GEE.from_formula(formula, groups, data,
                                          missing='drop')
        dropping_fit = dropping_model.fit()

        assert_almost_equal(len(dropping_model.endog), 95)
        assert_almost_equal(np.asarray(dropping_model.exog.shape),
                            np.r_[95, 4])

        # Same model on the pre-cleaned data, with no missing handling.
        data = data.dropna()
        groups = groups[data.index.values]
        clean_fit = GEE.from_formula(formula, groups, data,
                                     missing='none').fit()

        for attr in ("params", "bse"):
            assert_almost_equal(getattr(dropping_fit, attr).values,
                                getattr(clean_fit, attr).values)
Ejemplo n.º 9
0
    def test_missing(self):
        """
        Check missing-data handling when GEE is called with arrays:
        ``missing='drop'`` must match a fit on hand-filtered data.

        NOTE(review): the earlier docstring said that missing data
        handling does not currently work for formulas -- confirm
        whether that still holds.
        """

        size = 100
        endog = np.random.normal(size=size)
        exog = np.random.normal(size=(size, 3))
        exog[:, 0] = 1  # constant first column
        groups = np.kron(lrange(20), np.ones(5))

        # Introduce NaNs into five distinct rows.
        endog[0] = np.nan
        endog[5:7] = np.nan
        exog[10:12, 1] = np.nan

        dropping_model = GEE(endog, exog, groups, missing='drop')
        dropping_fit = dropping_model.fit()

        assert_almost_equal(len(dropping_model.endog), 95)
        assert_almost_equal(np.asarray(dropping_model.exog.shape),
                            np.r_[95, 3])

        # Keep only the rows in which every value is finite.
        keep = np.isfinite(endog) & np.isfinite(exog).all(1)
        clean_model = GEE(endog[keep], exog[keep, :], groups[keep],
                          missing='none')
        clean_fit = clean_model.fit()

        for attr in ("params", "bse"):
            assert_almost_equal(getattr(dropping_fit, attr),
                                getattr(clean_fit, attr))
Ejemplo n.º 10
0
    def test_formulas(self):
        """
        Fit one autoregressive Gaussian GEE from arrays and four times
        through the formula interface, passing groups and time either as
        arrays or as column names, and require identical parameters
        (8 decimal places).
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        # Design matrix for the array interface: constant column plus X1.
        design = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        array_fit = GEE(Y, design, groups, time=Time, family=family,
                        cov_struct=va).fit()

        # (groups, time) given as array/array, array/name, name/array
        # and name/name, fitted in that order.
        variants = (
            (groups, Time),
            (groups, "Time"),
            ("groups", Time),
            ("groups", "Time"),
        )
        formula_fits = []
        for groups_arg, time_arg in variants:
            model = GEE.from_formula("Y ~ X1",
                                     groups_arg,
                                     data,
                                     time=time_arg,
                                     family=family,
                                     cov_struct=va)
            formula_fits.append(model.fit())

        for formula_fit in formula_fits:
            assert_almost_equal(array_fit.params, formula_fit.params,
                                decimal=8)

        # The wrapper check is run on the array/array formula fit.
        check_wrapper(formula_fits[0])
Ejemplo n.º 11
0
def test_combinations():
    """Check ``combinations`` on a string and on a list of integers."""
    # Each case: (arguments for combinations, expected list of tuples).
    cases = [
        (('ABCD', 2),
         [('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'),
          ('C', 'D')]),
        ((lrange(4), 3),
         [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)]),
    ]
    for args, desired in cases:
        actual = list(combinations(*args))
        assert_(actual == desired, '%r not equal %r' % (actual, desired))
Ejemplo n.º 12
0
def test_combinations():
    # `combinations` must yield the r-length tuples in exactly the order
    # listed below, for a string as well as for a list of integers.
    def check(iterable, r, desired):
        actual = list(combinations(iterable, r))
        assert_(actual == desired, '%r not equal %r' % (actual, desired))

    check('ABCD', 2,
          [('A', 'B'), ('A', 'C'), ('A', 'D'), ('B', 'C'), ('B', 'D'),
           ('C', 'D')])
    check(lrange(4), 3,
          [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)])
Ejemplo n.º 13
0
def test_combinations():
    """Compare ``combinations`` with hand-written pair and triple lists."""
    # Expected 2-letter combinations of "ABCD", as tuples of characters.
    letter_pairs = [tuple(pair) for pair in ("AB", "AC", "AD", "BC", "BD", "CD")]
    actual = list(combinations("ABCD", 2))
    assert_(actual == letter_pairs,
            "%r not equal %r" % (actual, letter_pairs))

    # Expected 3-element combinations of 0..3.
    index_triples = [(0, 1, 2), (0, 1, 3), (0, 2, 3), (1, 2, 3)]
    actual = list(combinations(lrange(4), 3))
    assert_(actual == index_triples,
            "%r not equal %r" % (actual, index_triples))
Ejemplo n.º 14
0
    def test_formulas(self):
        """
        Formula interface check: groups and time may each be given as an
        array or as a column name.  All four combinations must reproduce
        the parameters of the equivalent array-based fit (8 decimals).
        """

        n = 100
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        # Constant column plus X1, used by the array-based reference fit.
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time,
                             "groups": groups})

        va = Autoregressive()
        family = Gaussian()

        reference = GEE(Y, mat, groups, time=Time, family=family,
                        cov_struct=va).fit()

        # Fit order: groups as array first (time array, time name), then
        # groups as column name (time array, time name).
        formula_fits = [
            GEE.from_formula("Y ~ X1", groups_arg, data, time=time_arg,
                             family=family, cov_struct=va).fit()
            for groups_arg in (groups, "groups")
            for time_arg in (Time, "Time")
        ]

        for fit in formula_fits:
            assert_almost_equal(reference.params, fit.params, decimal=8)

        # The wrapper check uses the fit with both arguments as arrays.
        check_wrapper(formula_fits[0])
Ejemplo n.º 15
0
    def t_est_missing(self):
        # Formula-based GEE with missing='drop' on data that has NaNs in
        # five rows: 95 observations and a 4-column design matrix are
        # expected to remain.
        # NOTE(review): the `t_est_` prefix presumably keeps test runners
        # from collecting this method -- confirm it is disabled on purpose.

        size = 100
        Y, X1, X2, X3 = [np.random.normal(size=size) for _ in range(4)]
        groups = np.kron(lrange(20), np.ones(5))

        # Missing responses in rows 0, 5, 6; missing X2 in rows 10, 11.
        Y[0] = np.nan
        Y[5:7] = np.nan
        X2[10:12] = np.nan

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3,
                          "groups": groups})

        model = GEE.from_formula("Y ~ X1 + X2 + X3", D, None,
                                 groups=D["groups"], missing='drop')
        # Fitting is executed, but its result is not inspected.
        model.fit()

        assert len(model.endog) == 95
        assert model.exog.shape == (95, 4)
Ejemplo n.º 16
0
    def test_logistic(self):
        # Logistic GEE fits are compared with reference values produced
        # by the R `gee` package.  R code for comparing results:
        #
        # library(gee)
        # Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        #
        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="independence")
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")
        #
        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="exchangeable")
        # sme = summary(me)
        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")
        #
        # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #          corstr="AR-M")
        # sma = summary(ma)
        # u = coefficients(sma)
        # cfa = paste(u[,1], collapse=",")
        # sea = paste(u[,4], collapse=",")
        #
        # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model: 0, 1, 2, ... within
        # each group.
        T = np.zeros(len(endog))
        for label in set(group):
            rows = np.flatnonzero(group == label)
            T[rows] = lrange(len(rows))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # Reference coefficients (cf) and standard errors (se) from R gee;
        # rows are independence, exchangeable, AR-M in that order.
        cf = [
            [0.0167272965285882, 1.13038654425893,
             -1.86896345082962, 1.09397608331333],
            [0.0178982283915449, 1.13118798191788,
             -1.86133518416017, 1.08944256230299],
            [0.0109621937947958, 1.13226505028438,
             -1.88278757333046, 1.09954623769449],
        ]
        se = [
            [0.127291720283049, 0.166725808326067,
             0.192430061340865, 0.173141068839597],
            [0.127045031730155, 0.165470678232842,
             0.192052750030501, 0.173174779369249],
            [0.127240302296444, 0.170554083928117,
             0.191045527104503, 0.169776150974586],
        ]

        for cov, coef_ref, se_ref in zip((vi, ve, va), cf, se):
            fit = GEE(endog, exog, group, T, family, cov).fit()
            # The autoregressive model is fitted but not compared with R.
            if cov is va:
                continue
            assert_almost_equal(fit.params, coef_ref, decimal=6)
            assert_almost_equal(fit.standard_errors(), se_ref, decimal=6)

        # Test with formulas: response, group id and the exog columns
        # after the first one go into a data frame.
        stacked = np.concatenate(
            (endog[:, None], group[:, None], exog[:, 1:]), axis=1)
        names = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        D = pd.DataFrame(stacked, columns=names)

        for cov, coef_ref, se_ref in zip((vi, ve), cf, se):
            fit = GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                   family=family, cov_struct=cov).fit()
            assert_almost_equal(fit.params, coef_ref, decimal=6)
            assert_almost_equal(fit.standard_errors(), se_ref, decimal=6)
Ejemplo n.º 17
0
    def test_logistic(self):
        """
        Compare logistic GEE fits with reference values from the R
        ``gee`` package, using both the array and the formula interface.

        R code for comparing results:

        library(gee)
        Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        Y = Z[,2]
        Id = Z[,1]
        X1 = Z[,3]
        X2 = Z[,4]
        X3 = Z[,5]

        mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="independence")
        smi = summary(mi)
        u = coefficients(smi)
        cfi = paste(u[,1], collapse=",")
        sei = paste(u[,4], collapse=",")

        me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="exchangeable")
        sme = summary(me)
        u = coefficients(sme)
        cfe = paste(u[,1], collapse=",")
        see = paste(u[,4], collapse=",")

        ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
                 corstr="AR-M")
        sma = summary(ma)
        u = coefficients(sma)
        cfa = paste(u[,1], collapse=",")
        sea = paste(u[,4], collapse=",")

        sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)
        """

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model: the position of each
        # observation inside its group.
        T = np.zeros(len(endog))
        for gid in set(group):
            members = np.flatnonzero(group == gid)
            T[members] = lrange(len(members))

        family = Binomial()
        ve = Exchangeable()
        vi = Independence()
        va = Autoregressive()

        # From R gee; rows are independence, exchangeable, AR-M.
        cf = [
            [0.0167272965285882, 1.13038654425893, -1.86896345082962,
             1.09397608331333],
            [0.0178982283915449, 1.13118798191788, -1.86133518416017,
             1.08944256230299],
            [0.0109621937947958, 1.13226505028438, -1.88278757333046,
             1.09954623769449],
        ]
        se = [
            [0.127291720283049, 0.166725808326067, 0.192430061340865,
             0.173141068839597],
            [0.127045031730155, 0.165470678232842, 0.192052750030501,
             0.173174779369249],
            [0.127240302296444, 0.170554083928117, 0.191045527104503,
             0.169776150974586],
        ]

        def check_against_r(fit, row):
            # Compare one fit with row `row` of the R reference tables.
            assert_almost_equal(fit.params, cf[row], decimal=6)
            assert_almost_equal(fit.standard_errors(), se[row], decimal=6)

        for row, cov in enumerate((vi, ve, va)):
            fit = GEE(endog, exog, group, T, family, cov).fit()
            # The autoregressive model is fitted but not compared.
            if cov is not va:
                check_against_r(fit, row)

        # Test with formulas
        columns = ["Y", "Id"] + ["X%d" % k for k in range(1, exog.shape[1])]
        D = pd.DataFrame(
            np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1),
            columns=columns)
        for row, cov in enumerate((vi, ve)):
            md = GEE.from_formula("Y ~ X1 + X2 + X3",
                                  D,
                                  None,
                                  groups=D.loc[:, "Id"],
                                  family=family,
                                  cov_struct=cov)
            check_against_r(md.fit(), row)
Ejemplo n.º 18
0
def ols_high_d_category_multi_results(data_df, models, table_header):
    """
    Fit several model specifications on one dataframe and tabulate the
    results side by side, so that alternative model assumptions can be
    compared easily.

    Each entry of ``models`` is fitted with ``ols_high_d_category`` and the
    estimates are collected into a text ``SimpleTable`` holding three rows
    per coefficient (estimate, 'pvalue', 'std err') and one column per
    model.  A coefficient that a model does not contain is shown as 'Nan'.
    The table is printed and also returned.

    :param data_df: Dataframe with relevant data
    :param models: non-empty list of mappings; each must provide the keys
        'consist_input', 'out_input', 'category_input' and 'cluster_input'.
        An empty list raises IndexError.
    :param table_header: passed to ``SimpleTable`` as its header argument
        (presumably one label per model -- confirm against callers); the
        table title itself is fixed to 'multi'
    :return: the ``SimpleTable`` that was printed
    """
    results = [
        ols_high_d_category(data_df,
                            spec['consist_input'],
                            spec['out_input'],
                            spec['category_input'],
                            spec['cluster_input'],
                            formula=None,
                            robust=False,
                            c_method='cgm',
                            epsilon=1e-5,
                            max_iter=1e6) for spec in models
    ]
    names_per_model = [result.params.index.to_list() for result in results]

    # Union of coefficient names in first-seen order: the first model's
    # names as they are, then any new names from the later models.
    all_names = list(names_per_model[0])
    for names in names_per_model[1:]:
        for name in names:
            if name not in all_names:
                all_names.append(name)

    # Row labels: three rows per coefficient.
    row_labels = []
    for name in all_names:
        row_labels.extend((name, 'pvalue', 'std err'))

    table_content = []
    for name in all_names:
        coeff_row = []
        pvalue_row = []
        std_row = []
        for result, names in zip(results, names_per_model):
            if name in names:
                coeff_row.append("%#7.4g" % (result.params[name]))
                pvalue_row.append("%#8.2g" % (result.pvalues[name]))
                # bse is looked up by position, unlike params/pvalues.
                std_row.append("%#8.2f" % (result.bse[names.index(name)]))
            else:
                # This model has no such coefficient.
                coeff_row.append('Nan')
                pvalue_row.append('Nan')
                std_row.append('Nan')
        table_content.extend(
            (tuple(coeff_row), tuple(pvalue_row), tuple(std_row)))

    txt_format = dict(
        fmt='txt',
        # basic table formatting
        table_dec_above='=',
        table_dec_below='-',
        title_align='l',
        # basic row formatting
        row_pre='',
        row_post='',
        header_dec_below='-',
        row_dec_below=None,
        colwidths=None,
        colsep=' ',
        data_aligns="l",
        # data formats
        data_fmts=["%s"],
        # labeled alignments
        stub_align='l',
        header_align='r',
        # labeled formats
        header_fmt='%s',
        stub_fmt='%s',
        header='%s',
        stub='%s',
        empty_cell='',
        empty='',
        missing='--',
    )
    table = SimpleTable(table_content,
                        table_header,
                        row_labels,
                        title='multi',
                        txt_fmt=txt_format)
    print(table)
    # Hand the table back as the docstring promises; callers that ignored
    # the former None return value are unaffected.
    return table