def select_penweight_kfold(self, alphas=None, cv_iterator=None, cost=None,
                           k_folds=5, k_grid=11):
    """Find alphas by k-fold cross-validation.

    Warning: This estimates ``k_folds`` models for each point in the
        grid of alphas.

    Parameters
    ----------
    alphas : None or list of arrays
        Grids of penalization weights, one array per smooth term. If None,
        a default grid with ``k_grid`` points per smooth term is used.
    cv_iterator : instance
        Instance of a cross-validation iterator. By default this is a
        KFold instance.
    cost : function
        The cost function used to evaluate the prediction error for the
        left-out sample. It should take two arrays as arguments and
        return one float. The default is the euclidean norm of the
        prediction error divided by the number of observations.
    k_folds : int
        Number of folds if the default KFold iterator is used.
        This is ignored if ``cv_iterator`` is not None.
    k_grid : int
        Number of points in the default grid of alphas. This is ignored
        if ``alphas`` is not None.

    Returns
    -------
    alpha_cv : list of float
        Best alpha in the grid according to cross-validation.
    res_cv : instance of MultivariateGAMCVPath
        The instance that was used for cross-validation and holds the
        results.

    Notes
    -----
    The default alphas are defined as
    ``alphas = [np.logspace(0, 7, k_grid) for _ in range(k_smooths)]``
    """
    if cost is None:
        def cost(x1, x2):
            return np.linalg.norm(x1 - x2) / len(x1)

    if alphas is None:
        alphas = [np.logspace(0, 7, k_grid) for _ in range(self.k_smooths)]

    if cv_iterator is None:
        cv_iterator = KFold(k_folds=k_folds, shuffle=True)

    gam_cv = MultivariateGAMCVPath(smoother=self.smoother, alphas=alphas,
                                   gam=GLMGam, cost=cost, endog=self.endog,
                                   exog=self.exog_linear,
                                   cv_iterator=cv_iterator)
    gam_cv_res = gam_cv.fit()

    return gam_cv_res.alpha_cv, gam_cv_res
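
# Illustrative usage sketch (not part of the statsmodels source): it shows
# how ``select_penweight_kfold`` might be called end to end. The synthetic
# data, smoother settings, and alpha grid below are assumptions made for
# this example only.
def _example_select_penweight_kfold():
    import numpy as np
    from statsmodels.gam.api import GLMGam, BSplines

    # synthetic data: a smooth signal plus noise
    rng = np.random.default_rng(0)
    x = rng.uniform(0, 1, 200)
    y = np.sin(2 * np.pi * x) + rng.normal(scale=0.2, size=200)
    bsplines = BSplines(x, df=[10], degree=[3])

    # pick the penalization weight by 3-fold cross-validation
    gam = GLMGam(y, smoother=bsplines)
    alpha_cv, res_cv = gam.select_penweight_kfold(
        alphas=[np.logspace(0, 4, 5)], k_folds=3)

    # refit with the cross-validated penalization weight
    res = GLMGam(y, smoother=bsplines, alpha=alpha_cv).fit()
    return alpha_cv, res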
def test_multivariate_gam_cv():
    # smoke test: no result is checked, it only verifies that the code
    # runs without a runtime error

    def cost(x1, x2):
        return np.linalg.norm(x1 - x2) / len(x1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [5]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)

    alphas = [2]
    cv = KFold(3)

    gp = MultivariateGamPenalty(bsplines, alpha=alphas)  # noqa: F841
    gam_cv = MultivariateGAMCV(smoother=bsplines, alphas=alphas, gam=GLMGam,
                               cost=cost, endog=y, exog=None, cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841
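
# Illustrative sketch (not part of the original tests): per the
# ``select_penweight_kfold`` docstring, any callable that takes two arrays
# and returns one float can serve as the cost function, so a mean absolute
# error criterion is a drop-in alternative to ``cost`` above. The name
# ``mae_cost`` is made up for this example.
def mae_cost(x1, x2):
    # mean absolute prediction error for the left-out sample
    return np.mean(np.abs(x1 - x2))

# e.g. MultivariateGAMCV(smoother=bsplines, alphas=alphas, gam=GLMGam,
#                        cost=mae_cost, endog=y, exog=None, cv_iterator=cv)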
def test_multivariate_gam_cv_path():

    def sample_metric(y1, y2):
        return np.linalg.norm(y1 - y2) / len(y1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    y_mgcv = data_from_r.y_mgcv_gcv  # noqa: F841

    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    gam = GLMGam
    alphas = [np.linspace(0, 2, 10)]
    k = 3
    cv = KFold(k_folds=k, shuffle=True)

    # Note: the KFold iterator uses a random shuffle
    np.random.seed(123)
    gam_cv = MultivariateGAMCVPath(smoother=bsplines, alphas=alphas, gam=gam,
                                   cost=sample_metric, endog=y, exog=None,
                                   cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841

    glm_gam = GLMGam(y, smoother=bsplines, alpha=gam_cv.alpha_cv)
    res_glm_gam = glm_gam.fit(method='irls', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_est = res_glm_gam.predict(bsplines.basis)

    # plt.plot(x, y, '.', label='y')
    # plt.plot(x, y_est, '.', label='y est')
    # plt.plot(x, y_mgcv, '.', label='y mgcv')
    # plt.legend()
    # plt.show()

    # The test compares to the result obtained with GCV, not k-fold CV,
    # because mgcv does not support k-fold CV.
    assert_allclose(data_from_r.y_mgcv_gcv, y_est, atol=1.e-1, rtol=1.e-1)

    # Note: the KFold iterator uses a random shuffle
    np.random.seed(123)
    alpha_cv, res_cv = glm_gam.select_penweight_kfold(alphas=alphas,
                                                      k_folds=3)
    assert_allclose(alpha_cv, gam_cv.alpha_cv, rtol=1e-12)
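
# Illustrative sketch (not part of the original tests): the two
# ``np.random.seed(123)`` calls above are needed because the KFold iterator
# with ``shuffle=True`` appears to draw from numpy's global RNG. Passing an
# explicit iterator with ``shuffle=False`` gives deterministic folds without
# touching the global seed; the function name is made up for this example.
def _example_deterministic_kfold(glm_gam, alphas):
    cv = KFold(k_folds=3, shuffle=False)  # deterministic, contiguous folds
    alpha_cv, res_cv = glm_gam.select_penweight_kfold(alphas=alphas,
                                                      cv_iterator=cv)
    return alpha_cv, res_cv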