import numpy as np
from numpy.testing import assert_, assert_allclose, assert_equal

from statsmodels.base.distributed_estimation import (
    DistributedModel, _est_regularized_naive, _join_naive)
from statsmodels.genmod.families import Binomial
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.regression.linear_model import OLS


def test_repeat_partition():

    # tests that if we use identical partitions the average is the same
    # as the estimate for the full data

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    def _rep_data_gen(endog, exog, partitions):
        """partitions data"""

        n_exog = exog.shape[0]
        n_part = np.ceil(n_exog / partitions)

        ii = 0
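        # note: yields the full (endog, exog) pair once per partition, so
        # every "partition" is an identical copy of the data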
        while ii < n_exog:
            yield endog, exog
            ii += int(n_part)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
# From api.py in the wjsi/mars project: a thin wrapper whose constructor
# builds a statsmodels DistributedModel.  The snippet was extracted without
# its enclosing class header, so the class name below is a placeholder.
class MarsDistributedModelWrapper:
    def __init__(
        self,
        factor=None,
        num_partitions=None,
        model_class=None,
        init_kwds=None,
        estimation_method=None,
        estimation_kwds=None,
        join_method=None,
        join_kwds=None,
        results_class=None,
        results_kwds=None,
    ):
        self._factor = factor
        self._sm_model = DistributedModel(
            num_partitions or 10,
            model_class=model_class,
            init_kwds=init_kwds,
            estimation_method=estimation_method,
            estimation_kwds=estimation_kwds,
            join_method=join_method,
            join_kwds=join_kwds,
            results_class=results_class,
            results_kwds=results_kwds,
        )
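
# The tests below call a module-level `_data_gen` helper that this extract
# omits.  A minimal sketch, reconstructed from the `_rep_data_gen` pattern
# above (in statsmodels it is defined alongside the tests); the exact
# original may differ:
def _data_gen(endog, exog, partitions):
    """partitions data into consecutive, roughly equal chunks"""

    n_exog = exog.shape[0]
    n_part = np.ceil(n_exog / partitions)

    ii = 0
    while ii < n_exog:
        jj = int(min(ii + n_part, n_exog))
        yield endog[ii:jj], exog[ii:jj]
        ii += int(n_part)
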
def test_non_zero_params():

    # tests that thresholding in the join step does not change which
    # parameters are estimated as nonzero relative to the full-data fit

    np.random.seed(435265)
    N = 200
    p = 10
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m, join_kwds={"threshold": 0.13})
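    # join_kwds are forwarded to the join step; as I understand it, the
    # `threshold` sets coefficient estimates with magnitude below 0.13 to
    # zero after joining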
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    nz_params_db = 1 * (fitOLSdb.params != 0)
    nz_params_ols = 1 * (fitOLS.params != 0)

    assert_allclose(nz_params_db, nz_params_ols)
def test_single_partition():

    # tests that with a single partition the distributed estimates match
    # the full-data fit

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    # test regularized OLS v. naive
    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    nv_mod = DistributedModel(m,
                              estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit(alpha=0)

    assert_allclose(fitOLSdb.params, fitOLS.params)
    assert_allclose(fitOLSnv.params, fitOLS.params)

    # test regularized
    nv_mod = DistributedModel(m,
                              estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
def test_debiased_v_average():

    # tests that the debiased method performs better than the standard
    # average.  Does this for both OLS and GLM.

    np.random.seed(435265)
    N = 200
    p = 10
    m = 4

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    olsdb = np.linalg.norm(fitOLSdb.params - beta)
    n_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                             join_method=_join_naive)
    fitOLSn = n_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    olsn = np.linalg.norm(fitOLSn.params - beta)

    assert_(olsdb < olsn)

    prob = 1 / (1 + np.exp(-X.dot(beta) + np.random.normal(size=N)))
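    # threshold the latent probabilities to get a binary response for the
    # GLM comparison with a Binomial family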
    y = 1. * (prob > 0.5)

    db_mod = DistributedModel(m, model_class=GLM,
                              init_kwds={"family": Binomial()})
    fitGLMdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    glmdb = np.linalg.norm(fitGLMdb.params - beta)
    n_mod = DistributedModel(m, model_class=GLM,
                             init_kwds={"family": Binomial()},
                             estimation_method=_est_regularized_naive,
                             join_method=_join_naive)
    fitGLMn = n_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.2})
    glmn = np.linalg.norm(fitGLMn.params - beta)

    assert_(glmdb < glmn)
def test_larger_p():

    # tests when p > N / m for the debiased and naive case

    np.random.seed(435265)
    N = 40
    p = 40
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    assert_equal(np.sum(np.isnan(fitOLSdb.params)), 0)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    assert_equal(np.sum(np.isnan(fitOLSnv.params)), 0)
# The extract truncated the docstring example's partitioning generators; a
# minimal reconstruction following the same pattern as `_rep_data_gen`
# above (the exact originals may differ slightly):

import numpy as np
from scipy.stats import norm

from statsmodels.base.distributed_estimation import DistributedModel


def _exog_gen(exog, partitions):
    """partitions exog data"""

    n_exog = exog.shape[0]
    n_part = np.ceil(n_exog / partitions)

    ii = 0
    while ii < n_exog:
        jj = int(min(ii + n_part, n_exog))
        yield exog[ii:jj]
        ii += int(n_part)


def _endog_gen(endog, partitions):
    """partitions endog data"""

    n_endog = endog.shape[0]
    n_part = np.ceil(n_endog / partitions)

    ii = 0
    while ii < n_endog:
        jj = int(min(ii + n_part, n_endog))
        yield endog[ii:jj]
        ii += int(n_part)


# Next we generate some random data to serve as an example.

X = np.random.normal(size=(1000, 25))
beta = np.random.normal(size=25)
beta *= np.random.randint(0, 2, size=25)
y = norm.rvs(loc=X.dot(beta))
m = 5

# This is the most basic fit, showing all of the defaults, which are to
# use OLS as the model class, and the debiasing procedure.

debiased_OLS_mod = DistributedModel(m)
debiased_OLS_fit = debiased_OLS_mod.fit(zip(_endog_gen(y, m), _exog_gen(X, m)),
                                        fit_kwds={"alpha": 0.2})

# Then we run through a slightly more complicated example which uses the
# GLM model class.

from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gaussian

debiased_GLM_mod = DistributedModel(m,
                                    model_class=GLM,
                                    init_kwds={"family": Gaussian()})
debiased_GLM_fit = debiased_GLM_mod.fit(zip(_endog_gen(y, m), _exog_gen(X, m)),
                                        fit_kwds={"alpha": 0.2})
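
# The `estimation_method` and `join_method` can also be swapped out.  Below
# is a minimal sketch, mirroring the tests above, that uses the naive
# regularized estimator with the naive (simple averaging) join; the
# variable names are illustrative.

from statsmodels.base.distributed_estimation import (
    _est_regularized_naive, _join_naive)

naive_OLS_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                                 join_method=_join_naive)
naive_OLS_fit = naive_OLS_mod.fit(zip(_endog_gen(y, m), _exog_gen(X, m)),
                                  fit_kwds={"alpha": 0.2})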
def test_fit_joblib():

    # tests that the results of all the intermediate steps
    # remain correct for the joblib fit; does this for OLS and GLM
    # and a variety of model sizes
    #
    # regression test

    np.random.seed(435265)
    X = np.random.normal(size=(50, 3))
    y = np.random.randint(0, 2, size=50)

    mod = DistributedModel(1, model_class=OLS)
    fit = mod.fit(_data_gen(y, X, 1),
                  parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params,
                    np.array([-0.191606, -0.012565, -0.351398]),
                    atol=1e-6,
                    rtol=0)
    mod = DistributedModel(2, model_class=OLS)
    fit = mod.fit(_data_gen(y, X, 2),
                  parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params,
                    np.array([-0.157416, -0.029643, -0.471653]),
                    atol=1e-6,
                    rtol=0)
    mod = DistributedModel(3, model_class=OLS)
    fit = mod.fit(_data_gen(y, X, 3),
                  parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params,
                    np.array([-0.124891, -0.050934, -0.403354]),
                    atol=1e-6,
                    rtol=0)

    mod = DistributedModel(1,
                           model_class=GLM,
                           init_kwds={"family": Binomial()})
    fit = mod.fit(_data_gen(y, X, 1),
                  parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params,
                    np.array([-0.164515, -0.412854, -0.223955]),
                    atol=1e-6,
                    rtol=0)
    mod = DistributedModel(2,
                           model_class=GLM,
                           init_kwds={"family": Binomial()})
    fit = mod.fit(_data_gen(y, X, 2),
                  parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params,
                    np.array([-0.142513, -0.360324, -0.295485]),
                    atol=1e-6,
                    rtol=0)
    mod = DistributedModel(3,
                           model_class=GLM,
                           init_kwds={"family": Binomial()})
    fit = mod.fit(_data_gen(y, X, 3),
                  parallel_method="joblib",
                  fit_kwds={"alpha": 0.5})
    assert_allclose(fit.params,
                    np.array([-0.110487, -0.306431, -0.243921]),
                    atol=1e-6,
                    rtol=0)