Example #1
1
def test_zero_penalty():
    x, y, poly = multivariate_sample_data()
    alphas = [0, 0]
    gam_gs = GLMGam(y, smoother=poly, alpha=alphas)
    gam_gs_res = gam_gs.fit()
    y_est_gam = gam_gs_res.predict()

    glm = GLM(y, poly.basis).fit()
    y_est = glm.predict()

    assert_allclose(y_est, y_est_gam)
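The snippets in this listing appear to come from statsmodels' GAM test suite. They assume imports along the following lines, which the originals omit (module paths are my best guess):

import os

import numpy as np
from numpy.testing import assert_allclose
import pandas as pd

from statsmodels.gam.generalized_additive_model import GLMGam
from statsmodels.gam.smooth_basis import (BSplines, CyclicCubicSplines,
                                          GenericSmoothers)
from statsmodels.genmod.generalized_linear_model import GLM

# Some snippets additionally use patsy, statsmodels.genmod.families.family,
# MultivariateGamPenalty (statsmodels.gam.gam_penalties), and KFold /
# MultivariateGAMCVPath (statsmodels.gam.gam_cross_validation).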
Example #2
0
def test_partial_plot():
    # verify that plot and partial_values method agree
    # the model only has one component so partial values is the same as
    # fittedvalues
    # Generate a plot to visually inspect the result.

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df)

    alpha = 0.03
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(maxiter=10000, method='bfgs')
    fig = res_glm_gam.plot_partial(0)
    xp, yp = fig.axes[0].get_children()[0].get_data()
    # Note xp and yp are sorted by x
    sort_idx = np.argsort(x)
    hat_y, se = res_glm_gam.partial_values(0)
    # assert that main plot line is the prediction
    assert_allclose(xp, x[sort_idx])
    assert_allclose(yp, hat_y[sort_idx])
Example #3
0
def test_cov_params():

    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()

    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2, constraints='center')
    alpha = [0, 0]
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)
    res_glm = glm.fit()

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=0.0025)

    alpha = 1e-13
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    atol=1e-10)

    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=0, maxiter=5000, maxfun=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=1e-4, atol=1e-8)
Example #4
0
    def setup_class(cls):
        s_scale = 0.0263073404164214  # noqa: F841

        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6])
        gam_cc = GLMGam(data_mcycle['accel'], smoother=cc,
                        alpha=0)
        cls.res1 = gam_cc.fit(method='bfgs')
Example #5
0
    def setup_class(cls):
        s_scale = 0.0263073404164214

        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6])
        gam_cc = GLMGam(data_mcycle['accel'], smoother=cc,
                        alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit()
Example #6
0
def test_partial_values2():
    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()
    alpha = 0.0
    # BUG: mask is incorrect if exog is not None, start_idx missing
    # bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2)
    # glm_gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bsplines,
    #                  alpha=alpha)
    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2,
                        include_intercept=[True, False])
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)  # noqa: F841

    # case with constant column in exog is currently wrong
    # ex = np.column_stack((np.zeros((len(y), 1)), bsplines.smoothers[0].basis,
    #                       np.zeros_like(bsplines.smoothers[1].basis) ))
    ex = np.column_stack((bsplines.smoothers[0].basis,
                          np.zeros_like(bsplines.smoothers[1].basis)))

    y_est = res_glm_gam.predict(ex, transform=False)
    y_partial_est, se = res_glm_gam.partial_values(0)

    assert_allclose(y_est, y_partial_est, atol=0.05)
    assert se.min() < 100
Example #7
0
def test_multivariate_gam_1d_data():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y

    df = [10]
    degree = [3]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = data_from_r.y_est

    # alpha was adjusted manually to reduce the discrepancy in fittedvalues
    alpha = [0.0168 * 0.0251 / 2 * 500]
    gp = MultivariateGamPenalty(bsplines, alpha=alpha)    # noqa: F841

    glm_gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bsplines,
                     alpha=alpha)
    # "nm" converges to a different params, "bfgs" params are close to pirls
    # res_glm_gam = glm_gam.fit(method='nm', max_start_irls=0,
    #                           disp=1, maxiter=10000, maxfun=5000)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_gam = res_glm_gam.fittedvalues

    # plt.plot(x, y_gam, '.', label='gam')
    # plt.plot(x, y_mgcv, '.', label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, atol=0.01)
Example #8
0
def test_partial_values():
    # this test is only approximate because we don't use the same spline
    # basis functions (knots) as mgcv
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se
    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    # TODO: alpha found by trial and error to pass assert
    alpha = 0.025 / 115 * 500
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(maxiter=10000, method='bfgs')
    # TODO: if IRLS is used, res_glm_gam has no partial_values.

    univ_bsplines = bsplines.smoothers[0]  # noqa: F841
    hat_y, se = res_glm_gam.partial_values(0)

    assert_allclose(hat_y, data_from_r["y_est"], rtol=0, atol=0.008)
    # TODO: bug missing scale
    bug_fact = np.sqrt(res_glm_gam.scale) * 0.976  # this is about 0.106
    assert_allclose(se, se_from_mgcv * bug_fact, rtol=0, atol=0.008)
Example #9
0
    def setup_class(cls):

        sp = np.array([0.830689464223685, 425.361212061649])
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        x_spline = df_autos[['weight', 'hp']].values
        # We need asarray to remove the design_info
        # If design_info is attached,
        #     then exog_linear will also be transformed in predict.
        cls.exog = np.asarray(patsy.dmatrix('fuel + drive', data=df_autos))
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)
        # TODO: alpha needs to be a list
        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                        alpha=(alpha0).tolist())
        cls.res1a = gam_bs.fit(use_t=True)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs.mpg_bs

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed

        # for checking that alpha model attribute is unchanged, same as alpha0
        cls.alpha = [169947.78222669504, 26767.58046340008]
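Side note: the hard-coded cls.alpha values above are just the mgcv smoothing parameters sp rescaled by s_scale, i.e. alpha = sp / (2 * s_scale). A quick check with the numbers from this snippet:

import numpy as np

sp = np.array([0.830689464223685, 425.361212061649])
s_scale = np.array([2.443955e-06, 0.007945455])
print(1 / s_scale * sp / 2)
# [169947.78222669504 26767.58046340008] -- matches cls.alpha above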
Example #10
0
def test_gam_glm():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [3]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = np.asarray(data_from_r.y_est)

    alpha = 0.1  # chosen by trial and error

    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='bfgs',
                              max_start_irls=0,
                              disp=1,
                              maxiter=10000,
                              maxfun=5000)
    y_gam0 = np.dot(bsplines.basis, res_glm_gam.params)
    y_gam = np.asarray(res_glm_gam.fittedvalues)
    assert_allclose(y_gam, y_gam0, rtol=1e-10)

    # plt.plot(x, y_gam, '.', label='gam')
    # plt.plot(x, y_mgcv, '.', label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, atol=1.e-2)
Example #11
0
    def setup_class(cls):

        sp = np.array([40491.3940640059, 232455.530262537])
        # s_scale is the same as before
        cls.s_scale = s_scale = np.array([2.443955e-06, 0.007945455])

        x_spline = df_autos[['weight', 'hp']].values
        cls.exog = patsy.dmatrix('fuel + drive', data=df_autos)
        bs = BSplines(x_spline, df=[12, 10], degree=[3, 3],
                      variable_names=['weight', 'hp'],
                      constraints='center',
                      include_intercept=True)
        # TODO: alpha needs to be a list
        alpha0 = 1 / s_scale * sp / 2
        gam_bs = GLMGam(df_autos['city_mpg'], exog=cls.exog, smoother=bs,
                        family=family.Poisson(), alpha=alpha0)

        xnames = cls.exog.design_info.column_names + gam_bs.smoother.col_names
        gam_bs.exog_names[:] = xnames
        cls.res1a = gam_bs.fit(use_t=False)

        cls.res1b = gam_bs.fit(method='newton', use_t=True)
        cls.res1 = cls.res1a._results
        cls.res2 = results_mpg_bs_poisson.mpg_bs_poisson

        cls.rtol_fitted = 1e-8
        cls.covp_corrfact = 1  # not needed
Example #12
0
    def setup_class(cls):
        s_scale = 0.0263073404164214
        nobs = data_mcycle['times'].shape[0]
        cc = CyclicCubicSplines(data_mcycle['times'].values, df=[6],
                                constraints='center')
        gam_cc = GLMGam(data_mcycle['accel'], np.ones((nobs, 1)),
                        smoother=cc, alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit(method='pirls')
Example #13
0
def test_multivariate_gam_cv_path():
    def sample_metric(y1, y2):
        return np.linalg.norm(y1 - y2) / len(y1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    y_mgcv = data_from_r.y_mgcv_gcv  # noqa: F841

    df = [10]
    degree = [6]

    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    gam = GLMGam
    alphas = [np.linspace(0, 2, 10)]
    k = 3
    cv = KFold(k_folds=k, shuffle=True)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    gam_cv = MultivariateGAMCVPath(smoother=bsplines,
                                   alphas=alphas,
                                   gam=gam,
                                   cost=sample_metric,
                                   endog=y,
                                   exog=None,
                                   cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841

    glm_gam = GLMGam(y, smoother=bsplines, alpha=gam_cv.alpha_cv)
    res_glm_gam = glm_gam.fit(method='irls',
                              max_start_irls=0,
                              disp=1,
                              maxiter=10000)
    y_est = res_glm_gam.predict(bsplines.basis)

    # plt.plot(x, y, '.', label='y')
    # plt.plot(x, y_est, '.', label='y est')
    # plt.plot(x, y_mgcv, '.', label='y mgcv')
    # plt.legend()
    # plt.show()

    # The test compares to the result obtained with GCV, not k-fold CV,
    # because mgcv does not support k-fold CV.
    assert_allclose(data_from_r.y_mgcv_gcv, y_est, atol=1.e-1, rtol=1.e-1)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    alpha_cv, res_cv = glm_gam.select_penweight_kfold(alphas=alphas, k_folds=3)
    assert_allclose(alpha_cv, gam_cv.alpha_cv, rtol=1e-12)
Example #14
0
def test_approximation():
    np.random.seed(1)
    poly, y = polynomial_sample_data()
    alpha = 1
    for _ in range(10):
        params = np.random.uniform(-1, 1, 4)
        cost, err, itg = cost_function(params, poly, y, alpha)
        glm_gam = GLMGam(y, smoother=poly, alpha=alpha)
        # TODO: why do we need pen_weight=1
        gam_loglike = glm_gam.loglike(params, scale=1, pen_weight=1)
        assert_allclose(err - itg, cost, rtol=1e-10)
        assert_allclose(gam_loglike, cost, rtol=0.1)
Example #15
0
def test_generic_smoother():
    x, y, poly = multivariate_sample_data()
    alphas = [0.4, 0.7]
    weights = [1, 1]  # noqa: F841

    gs = GenericSmoothers(poly.x, poly.smoothers)
    gam_gs = GLMGam(y, smoother=gs, alpha=alphas)
    gam_gs_res = gam_gs.fit()

    gam_poly = GLMGam(y, smoother=poly, alpha=alphas)
    gam_poly_res = gam_poly.fit()

    assert_allclose(gam_gs_res.params, gam_poly_res.params)
Example #16
0
    def setup_class(cls):
        s_scale = 0.0263073404164214

        x = data_mcycle['times'].values
        endog = data_mcycle['accel']
        cc = CyclicCubicSplines(x, df=[6], constraints='center')
        gam_cc = GLMGam(endog, smoother=cc, alpha=1 / s_scale / 2)
        cls.res1 = gam_cc.fit(method='bfgs')
        cls.res2 = results_pls.pls5

        cls.rtol_fitted = 1e-5
        # cls.covp_corrfact = 1.0025464444310588  # without edf
        # edf is implemented
        cls.covp_corrfact = 1
Example #17
0
    def setup_class(cls):

        sp = np.array([6.46225497484073, 0.81532465890585])
        s_scale = np.array([2.95973613706629e-07, 0.000126203730141359])

        x_spline = df_autos[['weight', 'hp']].values
        exog = patsy.dmatrix('fuel + drive', data=df_autos)
        cc = CyclicCubicSplines(x_spline, df=[6, 5], constraints='center')
        # TODO: alpha needs to be a list
        gam_cc = GLMGam(df_autos['city_mpg'], exog=exog, smoother=cc,
                        alpha=(1 / s_scale * sp / 2).tolist())
        cls.res1a = gam_cc.fit()
        gam_cc = GLMGam(df_autos['city_mpg'], exog=exog, smoother=cc,
                        alpha=(1 / s_scale * sp / 2).tolist())
        cls.res1b = gam_cc.fit(method='newton')
Example #18
0
def test_glm_pirls_compatibility():
    np.random.seed(0)

    n = 500
    x1 = np.linspace(-3, 3, n)
    x2 = np.random.rand(n)

    x = np.vstack([x1, x2]).T
    y1 = np.sin(x1) / x1
    y2 = x2 * x2
    y0 = y1 + y2
    y = y0 + np.random.normal(0, .3, n)
    y -= y.mean()
    y0 -= y0.mean()

    # TODO: we now have alphas == alphas_glm
    alphas = [5.75] * 2
    alphas_glm = [1.2] * 2  # noqa: F841
    # using constraints avoids singular exog.
    cs = BSplines(x, df=[10, 10], degree=[3, 3], constraints='center')

    gam_pirls = GLMGam(y, smoother=cs, alpha=alphas)
    gam_glm = GLMGam(y, smoother=cs, alpha=alphas)

    gam_res_glm = gam_glm.fit(method='nm', max_start_irls=0,
                              disp=1, maxiter=20000, maxfun=10000)
    gam_res_glm = gam_glm.fit(start_params=gam_res_glm.params,
                              method='bfgs', max_start_irls=0,
                              disp=1, maxiter=20000, maxfun=10000)
    gam_res_pirls = gam_pirls.fit()

    y_est_glm = np.dot(cs.basis, gam_res_glm.params)
    y_est_glm -= y_est_glm.mean()
    y_est_pirls = np.dot(cs.basis, gam_res_pirls.params)
    y_est_pirls -= y_est_pirls.mean()

    # plt.plot(y_est_pirls)
    # plt.plot(y_est_glm)
    # plt.plot(y, '.')
    # plt.show()
    assert_allclose(gam_res_glm.params, gam_res_pirls.params, atol=5e-5,
                    rtol=5e-5)
    assert_allclose(y_est_glm, y_est_pirls, atol=5e-5)
Example #19
0
def test_cyclic_cubic_splines():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results",
                             "cubic_cyclic_splines_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)

    x = data_from_r[['x0', 'x2']].values
    y = data_from_r['y'].values
    y_est_mgcv = data_from_r[['y_est']].values  # noqa: F841
    s_mgcv = data_from_r[['s(x0)', 's(x2)']].values

    dfs = [10, 10]
    ccs = CyclicCubicSplines(x, df=dfs)
    alpha = [0.05 / 2, 0.0005 / 2]
    # TODO: if alpha changes in pirls this should be updated

    gam = GLMGam(y, smoother=ccs, alpha=alpha)
    gam_res = gam.fit(method='pirls')

    s0 = np.dot(ccs.basis[:, ccs.mask[0]],
                gam_res.params[ccs.mask[0]])
    # TODO: Mean has to be removed
    # removing mean could be replaced by options for intercept handling
    s0 -= s0.mean()

    s1 = np.dot(ccs.basis[:, ccs.mask[1]],
                gam_res.params[ccs.mask[1]])
    s1 -= s1.mean()  # TODO: Mean has to be removed

    # plt.subplot(2, 1, 1)
    # plt.plot(x[:, 0], s0, '.', label='s0')
    # plt.plot(x[:, 0], s_mgcv[:, 0], '.', label='s0_mgcv')
    # plt.legend(loc='best')
    #
    # plt.subplot(2, 1, 2)
    # plt.plot(x[:, 1], s1, '.', label='s1_est')
    # plt.plot(x[:, 1], s_mgcv[:, 1], '.', label='s1_mgcv')
    # plt.legend(loc='best')
    # plt.show()

    assert_allclose(s0, s_mgcv[:, 0], atol=0.02)
    assert_allclose(s1, s_mgcv[:, 1], atol=0.33)
Example #20
0
def test_multivariate_cubic_splines():
    np.random.seed(0)
    from statsmodels.gam.smooth_basis import CubicSplines

    n = 500
    x1 = np.linspace(-3, 3, n)
    x2 = np.linspace(0, 1, n)**2

    x = np.vstack([x1, x2]).T
    y1 = np.sin(x1) / x1
    y2 = x2 * x2
    y0 = y1 + y2
    # need small enough noise variance to get good estimate for this test
    y = y0 + np.random.normal(0, .3 / 2, n)
    y -= y.mean()
    y0 -= y0.mean()

    alphas = [1e-3, 1e-3]
    cs = CubicSplines(x, df=[10, 10], constraints='center')

    gam = GLMGam(y, exog=np.ones((n, 1)), smoother=cs, alpha=alphas)
    gam_res = gam.fit(method='pirls')

    y_est = gam_res.fittedvalues
    y_est -= y_est.mean()

    # cut the tails
    index = list(range(50, n - 50))
    y_est = y_est[index]
    y0 = y0[index]
    y = y[index]

    # plt.plot(y_est, label='y est')
    # plt.plot(y0, label='y0')
    # plt.plot(y, '.', label='y')
    # plt.legend(loc='best')
    # plt.show()

    assert_allclose(y_est, y0, atol=0.04)
Example #21
0
def decompose(x, transform=True):
    # Decompose data into trend, seasonality and randomness
    # Accepts a pandas series object with a datetime index
    if (transform and min(x.dropna()) >= 0):
        # Transform the data, choosing the lambda that maximizes the
        # log-likelihood. The R version also offers a method that minimizes
        # the coefficient of variation ("guerrero").
        x_transformed, var_lambda = boxcox(na_contiguous(x), lmbda=None)
        x_transformed = pd.Series(x_transformed, index=na_contiguous(x).index)

    else:
        x_transformed = x
        var_lambda = np.nan
        transform = False

    # Seasonal data
    # The R code computes the number of samples per unit time (it should
    # always be 1 here). An earlier attempt inverted the differences of the
    # datetime index, following
    # https://stackoverflow.com/questions/36583859/compute-time-difference-of-datetimeindex
    idx = x_transformed.index  # noqa: F841
    # samples = np.unique([int(1 / (idx[n] - idx[n - 1]).days)
    #                      for n in range(1, len(idx))])
    # samples = samples[~np.isnan(samples)]  # filter out nulls
    # if len(samples) == 1 and samples.item() > 1:

    # Use R's frequency() instead. The condition is meant to be "> 1", but
    # every series here gets frequency 1 in Python while R reports 4, so R
    # always takes this branch and this code should always take it as well.
    if int(rstats.frequency(x_transformed).item()) == 1:
        # Decompose
        stl = sm.tsa.seasonal_decompose(na_contiguous(x_transformed))
        # stl = rstats.stl(na_contiguous(x_transformed), s_window='periodic')
        # The call above fails with:
        #   R[write to console]: Error in (function (x, s.window,
        #   s.degree = 0, t.window = NULL, t.degree = 1, ...:
        #     series is not periodic or has less than two periods
        trend = stl.trend
        seasonality = stl.seasonal
        remainder = x_transformed - trend - seasonality

    else:
        # Nonseasonal data
        trend = pd.Series(np.nan, index=x_transformed.index)
        time_index = pd.Index([i for i in range(1, len(x_transformed) + 1)])
        # Python-specific attempt at the trend; its result is currently
        # unused because the R fit below is used instead
        bs = BSplines(time_index, df=[12, 10], degree=[3, 3])  # noqa: F841
        cs = CyclicCubicSplines(time_index, df=[3, 3])
        alpha = np.array([218.338888])
        gam = GLMGam(x_transformed, smoother=cs, alpha=alpha).fit()  # noqa: F841
        # trend.loc[~x_transformed.isnull()] = gam.fittedvalues

        # R code: fit a smooth trend with mgcv via rpy2
        fmla = Formula('x ~ s(tt)')
        env = fmla.environment
        env['tt'] = time_index
        env['x'] = x_transformed
        trend.loc[~x_transformed.isnull()] = rstats.fitted(rmgcv.gam(fmla))
        seasonality = pd.Series(np.nan, index=x_transformed.index)
        remainder = x_transformed - trend

    return_dct = {
        'x': x_transformed,
        'trend': trend,
        'seasonality': seasonality,
        'remainder': remainder,
        'transform': transform,
        'lambda': var_lambda,
    }

    return return_dct
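For reference, a minimal usage sketch of decompose on made-up quarterly data; it assumes the module-level helpers used above (na_contiguous and the rpy2 objects rstats, rmgcv, Formula) are already set up:

import numpy as np
import pandas as pd

idx = pd.date_range('2000-01-01', periods=48, freq='QS')  # hypothetical data
series = pd.Series(np.random.gamma(2.0, 10.0, size=48), index=idx)

parts = decompose(series, transform=True)
print(parts['lambda'])
print(parts['trend'].head())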