def select_penweight_kfold(self,
                               alphas=None,
                               cv_iterator=None,
                               cost=None,
                               k_folds=5,
                               k_grid=11):
        """find alphas by k-fold cross-validation

        Warning: This estimates ``k_folds`` models for each point in the
            grid of alphas.

        Parameters
        ----------
        alphas : None or list of arrays
            Grids of penalty weights to search over, one array per smooth
            term. If None, a default log-spaced grid is used (see Notes).
        cv_iterator : instance
            Instance of a cross-validation iterator, by default this is a
            KFold instance.
        cost : function
            The cost function to evaluate the prediction error for the left
            out sample. This should take two arrays as arguments and return
            one float. The default is the Euclidean norm of the prediction
            error divided by the number of observations.
        k_folds : int
            Number of folds if the default KFold iterator is used.
            This is ignored if ``cv_iterator`` is not None.
        k_grid : int
            Number of points in each default alpha grid.
            This is ignored if ``alphas`` is not None.

        Returns
        -------
        alpha_cv : list of float
            Best alpha in the grid according to cross-validation.
        res_cv : instance of MultivariateGAMCVPath
            The instance that was used for cross-validation and holds the
            results.

        Notes
        -----
        The default alphas are defined as
        ``alphas = [np.logspace(0, 7, k_grid) for _ in range(k_smooths)]``

        """

        if cost is None:

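            # Default cost: Euclidean norm of the prediction error divided
            # by the number of left-out observations.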
            def cost(x1, x2):
                return np.linalg.norm(x1 - x2) / len(x1)

        if alphas is None:
            alphas = [np.logspace(0, 7, k_grid) for _ in range(self.k_smooths)]

        if cv_iterator is None:
            cv_iterator = KFold(k_folds=k_folds, shuffle=True)

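        # Run the cross-validation path: this fits ``k_folds`` models for
        # each point in the grid of alphas (see the Warning above).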
        gam_cv = MultivariateGAMCVPath(smoother=self.smoother,
                                       alphas=alphas,
                                       gam=GLMGam,
                                       cost=cost,
                                       endog=self.endog,
                                       exog=self.exog_linear,
                                       cv_iterator=cv_iterator)
        gam_cv_res = gam_cv.fit()

        return gam_cv_res.alpha_cv, gam_cv_res
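
# Usage sketch (hypothetical; `x`, `y`, and the spline settings are
# assumptions, not taken from the source): fit a GLMGam and let k-fold
# cross-validation pick the penalty weights, mirroring the calls in the
# tests below.
#
#     bsplines = BSplines(x, df=[10], degree=[3])
#     gam = GLMGam(y, smoother=bsplines)
#     alpha_cv, res_cv = gam.select_penweight_kfold(k_folds=5, k_grid=11)
#     res = GLMGam(y, smoother=bsplines, alpha=alpha_cv).fit()
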
import os

import numpy as np
import pandas as pd
from numpy.testing import assert_allclose

from statsmodels.gam.generalized_additive_model import GLMGam
from statsmodels.gam.smooth_basis import BSplines
from statsmodels.gam.gam_penalties import MultivariateGamPenalty
from statsmodels.gam.gam_cross_validation.gam_cross_validation import (
    MultivariateGAMCV, MultivariateGAMCVPath)
from statsmodels.gam.gam_cross_validation.cross_validators import KFold


def test_multivariate_gam_cv():
    # Smoke test: no results are checked; it only verifies that there is no
    # runtime error.

    def cost(x1, x2):
        return np.linalg.norm(x1 - x2) / len(x1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [5]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)

    alphas = [2]
    cv = KFold(3)

    gp = MultivariateGamPenalty(bsplines, alpha=alphas)  # noqa: F841
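    # MultivariateGAMCV evaluates a single fixed alpha by cross-validation
    # (MultivariateGAMCVPath, used below, searches over a grid instead).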
    gam_cv = MultivariateGAMCV(smoother=bsplines,
                               alphas=alphas,
                               gam=GLMGam,
                               cost=cost,
                               endog=y,
                               exog=None,
                               cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841
def test_multivariate_gam_cv_path():
    def sample_metric(y1, y2):
        return np.linalg.norm(y1 - y2) / len(y1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    y_mgcv = data_from_r.y_mgcv_gcv  # noqa: F841

    df = [10]
    degree = [6]

    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    gam = GLMGam
    alphas = [np.linspace(0, 2, 10)]
    k = 3
    cv = KFold(k_folds=k, shuffle=True)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    gam_cv = MultivariateGAMCVPath(smoother=bsplines,
                                   alphas=alphas,
                                   gam=gam,
                                   cost=sample_metric,
                                   endog=y,
                                   exog=None,
                                   cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841

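    # Refit on the full sample with the penalty weights selected by CV.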
    glm_gam = GLMGam(y, smoother=bsplines, alpha=gam_cv.alpha_cv)
    res_glm_gam = glm_gam.fit(method='irls',
                              max_start_irls=0,
                              disp=1,
                              maxiter=10000)
    y_est = res_glm_gam.predict(bsplines.basis)

    # plt.plot(x, y, '.', label='y')
    # plt.plot(x, y_est, '.', label='y est')
    # plt.plot(x, y_mgcv, '.', label='y mgcv')
    # plt.legend()
    # plt.show()

    # The test compares to the result obtained with GCV, not k-fold CV,
    # because mgcv does not support k-fold CV.
    assert_allclose(data_from_r.y_mgcv_gcv, y_est, atol=1.e-1, rtol=1.e-1)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    alpha_cv, res_cv = glm_gam.select_penweight_kfold(alphas=alphas, k_folds=3)
    assert_allclose(alpha_cv, gam_cv.alpha_cv, rtol=1e-12)