def test_n_estimators(make_whas500):
    whas500 = make_whas500(with_mean=False, with_std=False, to_numeric=True)

    base_estimators = [('gbm', ComponentwiseGradientBoostingSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM())]
    meta = EnsembleSelection(base_estimators, scorer=score_cindex, n_estimators=0)

    with pytest.raises(ValueError, match="n_estimators must not be zero or negative"):
        meta.fit(whas500.x, whas500.y)

    meta.set_params(n_estimators=1000)
    with pytest.raises(ValueError,
                       match=r"n_estimators \(1000\) must not exceed number "
                             r"of base learners \(2\)"):
        meta.fit(whas500.x, whas500.y)
def test_min_correlation(self):
    base_estimators = [('gbm', ComponentwiseGradientBoostingSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM())]
    meta = EnsembleSelection(base_estimators, scorer=score_cindex, min_correlation=1.2)

    self.assertRaisesRegex(ValueError,
                           r"min_correlation must be in \[-1; 1\], but was 1.2",
                           meta.fit, self.x, self.y)

    meta.set_params(min_correlation=-2.1)
    self.assertRaisesRegex(ValueError,
                           r"min_correlation must be in \[-1; 1\], but was -2.1",
                           meta.fit, self.x, self.y)

    meta.set_params(min_correlation=numpy.nan)
    self.assertRaisesRegex(ValueError,
                           r"min_correlation must be in \[-1; 1\], but was nan",
                           meta.fit, self.x, self.y)
def test_correlation(make_whas500):
    whas500 = make_whas500(with_mean=False, with_std=False, to_numeric=True)

    base_estimators = [('gbm', ComponentwiseGradientBoostingSurvivalAnalysis()),
                       ('svm', FastSurvivalSVM())]

    meta = EnsembleSelection(base_estimators, scorer=score_cindex, correlation=None)
    with pytest.raises(ValueError,
                       match="correlation must be one of 'pearson', 'kendall', "
                             "and 'spearman', but got None"):
        meta.fit(whas500.x, whas500.y)

    meta = EnsembleSelection(base_estimators, scorer=score_cindex, correlation=2143)
    with pytest.raises(ValueError,
                       match="correlation must be one of 'pearson', 'kendall', "
                             "and 'spearman', but got 2143"):
        meta.fit(whas500.x, whas500.y)

    meta = EnsembleSelection(base_estimators, scorer=score_cindex, correlation="clearly wrong")
    with pytest.raises(ValueError,
                       match="correlation must be one of 'pearson', 'kendall', "
                             "and 'spearman', but got 'clearly wrong'"):
        meta.fit(whas500.x, whas500.y)
def test_squared_loss(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)

    model = ComponentwiseGradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100,
                                                          random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    time_predicted = model.predict(whas500_data.x)
    time_true = whas500_data.y["lenfol"]
    event_true = whas500_data.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    assert round(abs(rmse_all - 793.6256945839657), 7) == 0

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true],
                                                    time_predicted[event_true]))
    assert round(abs(rmse_uncensored - 542.83358120153525), 7) == 0

    cindex = model.score(whas500_data.x, whas500_data.y)
    assert round(abs(cindex - 0.7777082862), 7) == 0
def _create_ensemble(self, **kwargs):
    boosting_grid = ParameterGrid({
        "n_estimators": [100, 250],
        "subsample": [1.0, 0.75, 0.5],
    })
    svm_grid = ParameterGrid({"alpha": 2. ** numpy.arange(-9, 5, 2)})

    base_estimators = []
    for i, params in enumerate(boosting_grid):
        model = ComponentwiseGradientBoostingSurvivalAnalysis(random_state=0, **params)
        base_estimators.append(("gbm_%d" % i, model))

    for i, params in enumerate(svm_grid):
        model = FastSurvivalSVM(max_iter=100, random_state=0, **params)
        base_estimators.append(("svm_%d" % i, model))

    cv = KFold(n_splits=4, shuffle=True, random_state=0)
    meta = EnsembleSelection(base_estimators, n_estimators=0.4, scorer=score_cindex,
                             cv=cv, **kwargs)
    return meta
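# Usage sketch (an assumption, not taken from the source): a test would typically
# build the ensemble via the helper above, fit it, and predict on the training data.
# The self.x / self.y attributes follow the conventions of the unittest-style tests
# in this file; the shape assertion is illustrative only.
def test_create_ensemble_sketch(self):
    meta = self._create_ensemble()
    meta.fit(self.x, self.y)
    prediction = meta.predict(self.x)
    self.assertEqual(prediction.shape[0], self.x.shape[0])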
def test_fit_verbose(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = ComponentwiseGradientBoostingSurvivalAnalysis(n_estimators=10, verbose=1,
                                                          random_state=0)
    model.fit(whas500_data.x, whas500_data.y)
def test_fit_verbose(self):
    model = ComponentwiseGradientBoostingSurvivalAnalysis(n_estimators=10, verbose=1,
                                                          random_state=0)
    model.fit(self.x, self.y)
def test_feature_importances(self):
    model = ComponentwiseGradientBoostingSurvivalAnalysis(n_estimators=100, random_state=0)
    model.fit(self.x, self.y)

    # one importance per feature plus one for the intercept term of the componentwise model
    self.assertEqual(self.x.shape[1] + 1, len(model.feature_importances_))
# data_y
# df.groupby('status').count()

# Part 2: Componentwise Gradient Boosting for Survival Analysis
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.ensemble import ComponentwiseGradientBoostingSurvivalAnalysis

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# create the estimator
estimator = ComponentwiseGradientBoostingSurvivalAnalysis(loss='coxph', random_state=0)

# scoring function used during grid search: evaluates a fitted model
# with Harrell's concordance index
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['status'], y['time_to_event'], prediction)
    return result[0]

param_grid = {
    'learning_rate': [0.001, 0.01, 0.1, 0.5, 1],
    'n_estimators': [100, 200, 500, 1000],
}
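# A sketch (not part of the original snippet) of how the pieces above would typically
# be wired together: GridSearchCV driven by the custom scorer and a ShuffleSplit
# cross-validation scheme. The variables data_x and data_y are assumed to hold the
# encoded feature matrix and the structured survival array with the fields
# 'status' and 'time_to_event' used by score_survival_model.
cv = ShuffleSplit(n_splits=10, test_size=0.25, random_state=0)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   cv=cv, n_jobs=-1, refit=True)
gcv.fit(data_x, data_y)
print(gcv.best_params_)
print(gcv.best_score_)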