def test_squared_loss_staged_predict(self):
    # Test whether staged decision function eventually gives
    # the same prediction.
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(self.x, self.y)

    y_pred = model.predict(self.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(self.x):
        self.assertTupleEqual(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)

    model.set_params(dropout_rate=0.03)
    model.fit(self.x, self.y)

    y_pred = model.predict(self.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(self.x):
        self.assertTupleEqual(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)
def test_fit_verbose(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, verbose=1, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)
def test_squared_loss_staged_predict(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    # Test whether staged decision function eventually gives
    # the same prediction.
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    y_pred = model.predict(whas500_data.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(whas500_data.x):
        assert y.shape == y_pred.shape

    assert_array_equal(y_pred, y)

    model.set_params(dropout_rate=0.03)
    model.fit(whas500_data.x, whas500_data.y)

    y_pred = model.predict(whas500_data.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(whas500_data.x):
        assert y.shape == y_pred.shape

    assert_array_equal(y_pred, y)
def test_fit_subsample(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8, subsample=0.6,
                                             random_state=0)
    model.fit(self.x, self.y)

    self.assertEqual(model.max_features_, 8)
    self.assertTrue(hasattr(model, "oob_improvement_"))

    incl_mask = numpy.ones(self.x.shape[0], dtype=bool)
    incl_mask[[35, 111, 174, 206, 236, 268, 497]] = False
    x_test = self.x[incl_mask]
    y_test = self.y[incl_mask]

    p = model.predict(x_test)

    expected_cindex = numpy.array([0.8592640, 62905, 10303, 0, 110])
    result = concordance_index_censored(y_test['fstat'], y_test['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))

    self.assertTupleEqual((100,), model.train_score_.shape)
    self.assertTupleEqual((100,), model.oob_improvement_.shape)

    self.assertRaisesRegex(ValueError,
                           "Number of features of the model must match the input. "
                           "Model n_features is 14 and input n_features is 2 ",
                           model.predict, self.x[:, :2])
def test_presort(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, presort=None, random_state=0)

    with pytest.raises(ValueError,
                       match=r"'presort' should be in \('auto', True, False\). Got None instead."):
        model.fit(whas500_data.x, whas500_data.y)
def test_squared_loss(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    time_predicted = model.predict(whas500_data.x)
    time_true = whas500_data.y["lenfol"]
    event_true = whas500_data.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    assert round(abs(rmse_all - 580.23345259002951), 7) == 0

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true],
                                                    time_predicted[event_true]))
    assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0

    cindex = model.score(whas500_data.x, whas500_data.y)
    assert round(abs(cindex - 0.9021810004), 7) == 0

    with pytest.raises(ValueError,
                       match="`fit` must be called with the loss option set to 'coxph'"):
        model.predict_survival_function(whas500_data.x)

    with pytest.raises(ValueError,
                       match="`fit` must be called with the loss option set to 'coxph'"):
        model.predict_cumulative_hazard_function(whas500_data.x)
def test_fit(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_depth=3, min_samples_split=10,
                                             random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    assert model.max_features_ == 14
    assert not hasattr(model, "oob_improvement_")

    p = model.predict(whas500_data.x)

    assert_cindex_almost_equal(whas500_data.y['fstat'], whas500_data.y['lenfol'], p,
                               (0.86272605091218779, 64826, 10309, 14, 14))

    assert (100,) == model.train_score_.shape

    with pytest.raises(ValueError,
                       match="Number of features of the model must match the input. "
                             "Model n_features is 14 and input n_features is 2 "):
        model.predict(whas500_data.x[:, :2])
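# Note: the pytest-style tests in this file call an `assert_cindex_almost_equal`
# helper that is not part of this excerpt. A minimal sketch, assuming it mirrors
# the expanded assertions in the unittest variants below (concordance_index_censored
# followed by an elementwise almost-equal check):
def assert_cindex_almost_equal(event, time, estimate, expected):
    result = concordance_index_censored(event, time, estimate)
    assert_array_almost_equal(numpy.array(result), numpy.array(expected))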
def test_fit_subsample(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=50, max_features=8, subsample=0.6,
                                             presort=False, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    assert model.max_features_ == 8
    assert hasattr(model, "oob_improvement_")

    incl_mask = numpy.ones(whas500_data.x.shape[0], dtype=bool)
    incl_mask[[35, 111, 174, 206, 236, 268, 497]] = False
    x_test = whas500_data.x[incl_mask]
    y_test = whas500_data.y[incl_mask]

    p = model.predict(x_test)

    assert_cindex_almost_equal(y_test['fstat'], y_test['lenfol'], p,
                               (0.8330510326740247, 60985, 12221, 2, 110))

    assert (50,) == model.train_score_.shape
    assert (50,) == model.oob_improvement_.shape

    with pytest.raises(ValueError,
                       match="Number of features of the model must match the input. "
                             "Model n_features is 14 and input n_features is 2 "):
        model.predict(whas500_data.x[:, :2])
def test_fit_int_param_as_float(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)

    if _sklearn_version_under_0p21:
        max_depth = 3
    else:
        # Account for https://github.com/scikit-learn/scikit-learn/pull/12344
        max_depth = 4

    model = GradientBoostingSurvivalAnalysis(n_estimators=100.0, max_depth=float(max_depth),
                                             min_samples_split=10.0, random_state=0)

    params = model.get_params()
    assert 100 == params["n_estimators"]
    assert max_depth == params["max_depth"]
    assert 10 == params["min_samples_split"]

    model.set_params(max_leaf_nodes=15.0)
    assert 15 == model.get_params()["max_leaf_nodes"]

    model.fit(whas500_data.x, whas500_data.y)
    p = model.predict(whas500_data.x)
    assert_cindex_almost_equal(whas500_data.y['fstat'], whas500_data.y['lenfol'], p,
                               (0.90256690042449006, 67826, 7321, 2, 14))
def test_fit(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_depth=3, min_samples_split=10,
                                             random_state=0)
    model.fit(self.x, self.y)

    self.assertEqual(model.max_features_, 14)
    self.assertFalse(hasattr(model, "oob_improvement_"))

    p = model.predict(self.x)

    expected_cindex = numpy.array([0.86272605091218779, 64826, 10309, 14, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))

    self.assertTupleEqual((100,), model.train_score_.shape)

    self.assertRaisesRegex(ValueError,
                           "Number of features of the model must match the input. "
                           "Model n_features is 14 and input n_features is 2 ",
                           model.predict, self.x[:, :2])
def test_presort(make_whas500, presort):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, presort=presort, random_state=0)

    with pytest.deprecated_call(match="The parameter 'presort' is deprecated "):
        model.fit(whas500_data.x, whas500_data.y)
def test_presort(whas500_sparse_data):
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, presort=True, random_state=0)

    with pytest.raises(ValueError, match="Presorting is not supported for sparse matrices."):
        model.fit(whas500_sparse_data.x_sparse, whas500_sparse_data.y)
def test_negative_ccp_alpha(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    clf = GradientBoostingSurvivalAnalysis()
    msg = "ccp_alpha must be greater than or equal to 0"

    with pytest.raises(ValueError, match=msg):
        clf.set_params(ccp_alpha=-1.0)
        clf.fit(whas500_data.x, whas500_data.y)
def fit_and_score_features(X, y):
    """Fit a univariate gradient boosting model for each feature and return its c-index."""
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = GradientBoostingSurvivalAnalysis(verbose=True, n_estimators=500)
    for j in range(n_features):
        Xj = X[:, j:j + 1]  # single-column view of feature j
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores
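# Usage sketch for fit_and_score_features on the WHAS500 data. `load_whas500`
# and `encode_categorical` are sksurv helpers; this wrapper function itself is
# illustrative only and not part of the tests.
def example_univariate_feature_scores():
    from sksurv.column import encode_categorical
    from sksurv.datasets import load_whas500

    X, y = load_whas500()
    # One univariate model per column; a higher c-index marks a more predictive feature.
    return fit_and_score_features(encode_categorical(X).values, y)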
def test_monitor_early_stopping(self):
    est = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=50, max_depth=1,
                                           subsample=0.5, random_state=0)
    est.fit(self.x, self.y, monitor=early_stopping_monitor)

    self.assertEqual(est.n_estimators, 50)  # this is not altered
    self.assertEqual(est.estimators_.shape[0], 10)
    self.assertEqual(est.train_score_.shape[0], 10)
    self.assertEqual(est.oob_improvement_.shape[0], 10)
def test_monitor_early_stopping(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    est = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=50, max_depth=1,
                                           subsample=0.5, random_state=0)
    est.fit(whas500_data.x, whas500_data.y, monitor=early_stopping_monitor)

    assert est.n_estimators == 50  # this is not altered
    assert est.estimators_.shape[0] == 10
    assert est.train_score_.shape[0] == 10
    assert est.oob_improvement_.shape[0] == 10
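# Both monitor tests rely on an `early_stopping_monitor` callback defined
# elsewhere. A minimal sketch consistent with the assertions above, following
# scikit-learn's gradient boosting monitor contract (called as
# monitor(i, self, locals()); a truthy return value stops boosting):
def early_stopping_monitor(i, est, locals_):
    # Stop after the 10th iteration, so exactly 10 stages are fitted.
    return i == 9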
def test_ipcwls_loss(self):
    model = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(self.x, self.y)

    time_predicted = model.predict(self.x)
    time_true = self.y["lenfol"]
    event_true = self.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    self.assertAlmostEqual(rmse_all, 590.5441693629117)

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true],
                                                    time_predicted[event_true]))
    self.assertAlmostEqual(rmse_uncensored, 392.97741487479743)
def test_squared_loss(self):
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(self.x, self.y)

    time_predicted = model.predict(self.x)
    time_true = self.y["lenfol"]
    event_true = self.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    self.assertAlmostEqual(rmse_all, 580.23345259002951)

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true],
                                                    time_predicted[event_true]))
    self.assertAlmostEqual(rmse_uncensored, 383.10639243317951)
def test_fit_dropout(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8,
                                             learning_rate=1.0, dropout_rate=0.03,
                                             random_state=0)
    model.fit(self.x, self.y)

    self.assertFalse(hasattr(model, "oob_improvement_"))
    self.assertEqual(model.max_features_, 8)

    p = model.predict(self.x)

    expected_cindex = numpy.array([0.9094333, 68343, 6806, 0, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))
def test_dropout(whas500_sparse_data, loss):
    model = GradientBoostingSurvivalAnalysis(loss=loss, n_estimators=100, max_depth=1,
                                             min_samples_split=10, dropout_rate=0.03,
                                             random_state=0)
    model.fit(whas500_sparse_data.x_sparse, whas500_sparse_data.y)

    assert model.estimators_.shape[0] == 100
    assert model.train_score_.shape == (100,)

    sparse_predict = model.predict(whas500_sparse_data.x_dense)

    model.fit(whas500_sparse_data.x_dense, whas500_sparse_data.y)
    dense_predict = model.predict(whas500_sparse_data.x_dense)

    assert_array_almost_equal(sparse_predict, dense_predict)
def test_fit_dropout(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8,
                                             learning_rate=1.0, dropout_rate=0.03,
                                             random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    assert not hasattr(model, "oob_improvement_")
    assert model.max_features_ == 8

    p = model.predict(whas500_data.x)

    assert_cindex_almost_equal(whas500_data.y['fstat'], whas500_data.y['lenfol'], p,
                               (0.9094333, 68343, 6806, 0, 119))
def test_squared_loss(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    time_predicted = model.predict(whas500_data.x)
    time_true = whas500_data.y["lenfol"]
    event_true = whas500_data.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    assert round(abs(rmse_all - 580.23345259002951), 7) == 0

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true],
                                                    time_predicted[event_true]))
    assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0
def test_ipcwls_loss(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(loss="ipcwls", n_estimators=100, max_depth=3,
                                             random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    time_predicted = model.predict(whas500_data.x)
    time_true = whas500_data.y["lenfol"]
    event_true = whas500_data.y["fstat"]

    rmse_all = numpy.sqrt(mean_squared_error(time_true, time_predicted))
    assert round(abs(rmse_all - 590.5441693629117), 7) == 0

    rmse_uncensored = numpy.sqrt(mean_squared_error(time_true[event_true],
                                                    time_predicted[event_true]))
    assert round(abs(rmse_uncensored - 392.97741487479743), 7) == 0
def test_dropout(self):
    for loss in ('coxph', 'squared', 'ipcwls'):
        model = GradientBoostingSurvivalAnalysis(loss=loss, n_estimators=100, max_depth=1,
                                                 min_samples_split=10, dropout_rate=0.03,
                                                 random_state=0)
        model.fit(self.x_sparse, self.y)

        self.assertEqual(model.estimators_.shape[0], 100)
        self.assertTupleEqual(model.train_score_.shape, (100,))

        sparse_predict = model.predict(self.x_dense)

        model.fit(self.x_dense, self.y)
        dense_predict = model.predict(self.x_dense)

        assert_array_almost_equal(sparse_predict, dense_predict)
def test_fit_int_param_as_float(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=100.0, max_depth=3.0,
                                             min_samples_split=10.0, random_state=0)

    params = model.get_params()
    self.assertEqual(100, params["n_estimators"])
    self.assertEqual(3, params["max_depth"])
    self.assertEqual(10, params["min_samples_split"])

    model.set_params(max_leaf_nodes=15.0)
    self.assertEqual(15, model.get_params()["max_leaf_nodes"])

    model.fit(self.x, self.y)
    p = model.predict(self.x)

    expected_cindex = numpy.array([0.90256690042449006, 67826, 7321, 2, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))
def test_fit_int_param_as_float(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=100.0, max_depth=3.0,
                                             min_samples_split=10.0, random_state=0)

    params = model.get_params()
    assert 100 == params["n_estimators"]
    assert 3 == params["max_depth"]
    assert 10 == params["min_samples_split"]

    model.set_params(max_leaf_nodes=15.0)
    assert 15 == model.get_params()["max_leaf_nodes"]

    model.fit(whas500_data.x, whas500_data.y)
    p = model.predict(whas500_data.x)
    assert_cindex_almost_equal(whas500_data.y['fstat'], whas500_data.y['lenfol'], p,
                               (0.90256690042449006, 67826, 7321, 2, 119))
def test_predict_function(make_whas500, fn, expected_file):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_depth=2, random_state=0)
    train_x, train_y = whas500_data.x[10:], whas500_data.y[10:]
    model.fit(train_x, train_y)

    test_x = whas500_data.x[:10]
    surv_fn = getattr(model, fn)(test_x)

    times = numpy.unique(train_y["lenfol"][train_y["fstat"]])
    actual = numpy.row_stack([fn_gb(times) for fn_gb in surv_fn])

    expected = numpy.loadtxt(expected_file, delimiter=",")

    assert_array_almost_equal(actual, expected)
def test_ccp_alpha(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)

    est_full = GradientBoostingSurvivalAnalysis(n_estimators=10, max_leaf_nodes=20,
                                                random_state=1)
    est_full.fit(whas500_data.x, whas500_data.y)

    est_pruned = GradientBoostingSurvivalAnalysis(n_estimators=10, max_leaf_nodes=20,
                                                  ccp_alpha=10.0, random_state=1)
    est_pruned.fit(whas500_data.x, whas500_data.y)

    tree = est_full.estimators_[0, 0].tree_
    subtree = est_pruned.estimators_[0, 0].tree_
    assert tree.node_count > subtree.node_count
    assert tree.max_depth > subtree.max_depth
def test_max_features(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, max_features="auto", max_depth=3,
                                             random_state=0)
    model.fit(self.x, self.y)
    self.assertEqual(model.max_features_, self.x.shape[1])

    model.set_params(max_features="sqrt")
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, int(numpy.sqrt(self.x.shape[1])))

    model.set_params(max_features="log2")
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, int(numpy.log2(self.x.shape[1])))

    model.set_params(max_features=0.25)
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, int(0.25 * self.x.shape[1]))

    model.set_params(max_features=5)
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, 5)

    model.set_params(max_features=-1)
    self.assertRaisesRegex(ValueError, r"max_features must be in \(0, n_features\]",
                           model.fit, self.x, self.y)

    model.set_params(max_features=-1.125)
    self.assertRaisesRegex(ValueError, r"max_features must be in \(0, 1.0\]",
                           model.fit, self.x, self.y)

    model.set_params(max_features="fail_me")
    self.assertRaisesRegex(ValueError,
                           "Invalid value for max_features: 'fail_me'. "
                           "Allowed string values are 'auto', 'sqrt' or 'log2'",
                           model.fit, self.x, self.y)
def test_fit_subsample(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=100, max_features=8, subsample=0.6,
                                             random_state=0)
    model.fit(self.x, self.y)

    self.assertEqual(model.max_features_, 8)
    self.assertTrue(hasattr(model, "oob_improvement_"))

    p = model.predict(self.x)

    expected_cindex = numpy.array([0.8610760, 64709, 10440, 0, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))

    self.assertTupleEqual((100,), model.train_score_.shape)
    self.assertTupleEqual((100,), model.oob_improvement_.shape)

    self.assertRaisesRegex(ValueError,
                           "Number of features of the model must match the input. "
                           "Model n_features is 14 and input n_features is 2 ",
                           model.predict, self.x[:, :2])