def test_squared_loss_staged_predict(self):
    """Check that the last stage of ``staged_predict`` equals ``predict``.

    The check is performed twice on the same estimator: first with the
    initial settings, then after ``dropout_rate=0.03`` has been set via
    ``set_params`` and the model has been re-fitted.
    """
    model = GradientBoostingSurvivalAnalysis(
        loss="squared", n_estimators=100, max_depth=3, random_state=0)

    # An empty parameter set leaves the estimator untouched, so the first
    # pass exercises the configuration given to the constructor.
    for extra_params in ({}, {"dropout_rate": 0.03}):
        model.set_params(**extra_params)
        model.fit(self.x, self.y)
        final_pred = model.predict(self.x)

        # every stage must have the shape of the final prediction
        for stage_pred in model.staged_predict(self.x):
            self.assertTupleEqual(stage_pred.shape, final_pred.shape)

        # only the last stage is expected to equal ``predict``
        assert_array_equal(final_pred, stage_pred)
def test_squared_loss_staged_predict(make_whas500):
    """Check that the last stage of ``staged_predict`` equals ``predict``.

    The check is performed twice on the same estimator: first with the
    initial settings, then after ``dropout_rate=0.03`` has been set via
    ``set_params`` and the model has been re-fitted.
    """
    data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(
        loss="squared", n_estimators=100, max_depth=3, random_state=0)

    # An empty parameter set leaves the estimator untouched, so the first
    # pass exercises the configuration given to the constructor.
    for extra_params in ({}, {"dropout_rate": 0.03}):
        model.set_params(**extra_params)
        model.fit(data.x, data.y)
        final_pred = model.predict(data.x)

        # every stage must have the shape of the final prediction
        for stage_pred in model.staged_predict(data.x):
            assert stage_pred.shape == final_pred.shape

        # only the last stage is expected to equal ``predict``
        assert_array_equal(final_pred, stage_pred)
def test_fit(make_whas500):
    """Fit with default loss and verify attributes, c-index and input validation."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        n_estimators=100, max_depth=3, min_samples_split=10, random_state=0)
    estimator.fit(data.x, data.y)

    assert estimator.max_features_ == 14
    # no subsampling was requested, hence no out-of-bag attribute is expected
    assert not hasattr(estimator, "oob_improvement_")

    predicted = estimator.predict(data.x)
    expected_cindex = (0.86272605091218779, 64826, 10309, 14, 14)
    assert_cindex_almost_equal(
        data.y['fstat'], data.y['lenfol'], predicted, expected_cindex)

    assert (100, ) == estimator.train_score_.shape

    # predicting with fewer columns than used for fitting must be rejected
    error_pattern = ("Number of features of the model must match the input. "
                     "Model n_features is 14 and input n_features is 2 ")
    with pytest.raises(ValueError, match=error_pattern):
        estimator.predict(data.x[:, :2])
def test_fit_subsample(make_whas500):
    """Fit with ``subsample=0.6`` and verify attributes, c-index and input validation."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        n_estimators=50, max_features=8, subsample=0.6, presort=False,
        random_state=0)
    estimator.fit(data.x, data.y)

    assert estimator.max_features_ == 8
    assert hasattr(estimator, "oob_improvement_")

    # evaluate on all rows except a fixed set of excluded indices
    keep = numpy.ones(data.x.shape[0], dtype=bool)
    keep[[35, 111, 174, 206, 236, 268, 497]] = False
    x_eval, y_eval = data.x[keep], data.y[keep]

    predicted = estimator.predict(x_eval)
    expected_cindex = (0.8330510326740247, 60985, 12221, 2, 110)
    assert_cindex_almost_equal(
        y_eval['fstat'], y_eval['lenfol'], predicted, expected_cindex)

    assert (50,) == estimator.train_score_.shape
    assert (50,) == estimator.oob_improvement_.shape

    # predicting with fewer columns than used for fitting must be rejected
    error_pattern = ("Number of features of the model must match the input. "
                     "Model n_features is 14 and input n_features is 2 ")
    with pytest.raises(ValueError, match=error_pattern):
        estimator.predict(data.x[:, :2])
def test_dropout(whas500_sparse_data, loss):
    """Fit with dropout on ``x_sparse`` and ``x_dense`` and compare predictions.

    The same estimator is fitted on both inputs in turn; predictions are
    always made on ``x_dense`` and must agree to numerical precision.
    """
    data = whas500_sparse_data
    estimator = GradientBoostingSurvivalAnalysis(
        loss=loss, n_estimators=100, max_depth=1, min_samples_split=10,
        dropout_rate=0.03, random_state=0)

    estimator.fit(data.x_sparse, data.y)
    assert estimator.estimators_.shape[0] == 100
    assert estimator.train_score_.shape == (100,)
    pred_after_sparse_fit = estimator.predict(data.x_dense)

    estimator.fit(data.x_dense, data.y)
    pred_after_dense_fit = estimator.predict(data.x_dense)

    assert_array_almost_equal(pred_after_sparse_fit, pred_after_dense_fit)
def test_dropout(self):
    """Fit with dropout on ``x_sparse`` and ``x_dense`` for every loss.

    For each loss the same estimator is fitted on both inputs in turn;
    predictions are always made on ``x_dense`` and must agree to numerical
    precision.
    """
    losses = ('coxph', 'squared', 'ipcwls')
    for loss_name in losses:
        estimator = GradientBoostingSurvivalAnalysis(
            loss=loss_name, n_estimators=100, max_depth=1,
            min_samples_split=10, dropout_rate=0.03, random_state=0)

        estimator.fit(self.x_sparse, self.y)
        self.assertEqual(estimator.estimators_.shape[0], 100)
        self.assertTupleEqual(estimator.train_score_.shape, (100,))
        pred_after_sparse_fit = estimator.predict(self.x_dense)

        estimator.fit(self.x_dense, self.y)
        pred_after_dense_fit = estimator.predict(self.x_dense)

        assert_array_almost_equal(pred_after_sparse_fit, pred_after_dense_fit)
def test_fit_subsample(self):
    """Fit with ``subsample=0.6`` and verify attributes, c-index and input validation."""
    model = GradientBoostingSurvivalAnalysis(
        n_estimators=100, max_features=8, subsample=0.6, random_state=0)
    model.fit(self.x, self.y)

    # ``assertEquals`` is a deprecated alias that was removed in Python 3.12
    self.assertEqual(model.max_features_, 8)
    self.assertTrue(hasattr(model, "oob_improvement_"))

    # evaluate on all rows except a fixed set of excluded indices
    incl_mask = numpy.ones(self.x.shape[0], dtype=bool)
    incl_mask[[35, 111, 174, 206, 236, 268, 497]] = False
    x_test = self.x[incl_mask]
    y_test = self.y[incl_mask]

    p = model.predict(x_test)

    expected_cindex = numpy.array([0.8592640, 62905, 10303, 0, 110])
    result = concordance_index_censored(y_test['fstat'], y_test['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))

    self.assertTupleEqual((100, ), model.train_score_.shape)
    self.assertTupleEqual((100, ), model.oob_improvement_.shape)

    # predicting with fewer columns than used for fitting must be rejected
    self.assertRaisesRegex(
        ValueError,
        "Number of features of the model must match the input. "
        "Model n_features is 14 and input n_features is 2 ",
        model.predict, self.x[:, :2])
def test_fit(self):
    """Fit with default loss and verify attributes, c-index and input validation."""
    model = GradientBoostingSurvivalAnalysis(
        n_estimators=100, max_depth=3, min_samples_split=10, random_state=0)
    model.fit(self.x, self.y)

    # ``assertEquals`` is a deprecated alias that was removed in Python 3.12
    self.assertEqual(model.max_features_, 14)
    # no subsampling was requested, hence no out-of-bag attribute is expected
    self.assertFalse(hasattr(model, "oob_improvement_"))

    p = model.predict(self.x)

    expected_cindex = numpy.array(
        [0.86272605091218779, 64826, 10309, 14, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))

    self.assertTupleEqual((100, ), model.train_score_.shape)

    # predicting with fewer columns than used for fitting must be rejected
    self.assertRaisesRegex(
        ValueError,
        "Number of features of the model must match the input. "
        "Model n_features is 14 and input n_features is 2 ",
        model.predict, self.x[:, :2])
def test_fit_int_param_as_float(make_whas500):
    """Integer hyper-parameters passed as floats must be reported as integers.

    The ``max_depth`` used depends on ``_sklearn_version_under_0p21``.
    """
    data = make_whas500(with_std=False, to_numeric=True)

    # Account for https://github.com/scikit-learn/scikit-learn/pull/12344
    max_depth = 3 if _sklearn_version_under_0p21 else 4

    estimator = GradientBoostingSurvivalAnalysis(
        n_estimators=100.0, max_depth=float(max_depth),
        min_samples_split=10.0, random_state=0)

    reported = estimator.get_params()
    assert 100 == reported["n_estimators"]
    assert max_depth == reported["max_depth"]
    assert 10 == reported["min_samples_split"]

    # the same must hold for parameters changed after construction
    estimator.set_params(max_leaf_nodes=15.0)
    assert 15 == estimator.get_params()["max_leaf_nodes"]

    estimator.fit(data.x, data.y)
    predicted = estimator.predict(data.x)

    expected_cindex = (0.90256690042449006, 67826, 7321, 2, 14)
    assert_cindex_almost_equal(
        data.y['fstat'], data.y['lenfol'], predicted, expected_cindex)
def test_squared_loss(make_whas500):
    """Fit with ``loss="squared"`` and verify RMSE, score and unsupported methods."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        loss="squared", n_estimators=100, max_depth=3, random_state=0)
    estimator.fit(data.x, data.y)

    predicted_time = estimator.predict(data.x)
    observed_time = data.y["lenfol"]
    is_event = data.y["fstat"]

    # error over all samples
    mse_all = mean_squared_error(observed_time, predicted_time)
    rmse_all = numpy.sqrt(mse_all)
    assert round(abs(rmse_all - 580.23345259002951), 7) == 0

    # error restricted to samples selected by the ``fstat`` indicator
    mse_uncensored = mean_squared_error(
        observed_time[is_event], predicted_time[is_event])
    rmse_uncensored = numpy.sqrt(mse_uncensored)
    assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0

    cindex = estimator.score(data.x, data.y)
    assert round(abs(cindex - 0.9021810004), 7) == 0

    # both function-valued predictions must be refused for this loss
    error_pattern = "`fit` must be called with the loss option set to 'coxph'"
    with pytest.raises(ValueError, match=error_pattern):
        estimator.predict_survival_function(data.x)

    with pytest.raises(ValueError, match=error_pattern):
        estimator.predict_cumulative_hazard_function(data.x)
def test_squared_loss(self):
    """Fit with ``loss="squared"`` and verify RMSE on all and on event samples."""
    estimator = GradientBoostingSurvivalAnalysis(
        loss="squared", n_estimators=100, max_depth=3, random_state=0)
    estimator.fit(self.x, self.y)

    predicted_time = estimator.predict(self.x)
    observed_time = self.y["lenfol"]
    is_event = self.y["fstat"]

    # error over all samples
    mse_all = mean_squared_error(observed_time, predicted_time)
    self.assertAlmostEqual(numpy.sqrt(mse_all), 580.23345259002951)

    # error restricted to samples selected by the ``fstat`` indicator
    mse_uncensored = mean_squared_error(
        observed_time[is_event], predicted_time[is_event])
    self.assertAlmostEqual(numpy.sqrt(mse_uncensored), 383.10639243317951)
def test_ipcwls_loss(self):
    """Fit with ``loss="ipcwls"`` and verify RMSE on all and on event samples."""
    estimator = GradientBoostingSurvivalAnalysis(
        loss="ipcwls", n_estimators=100, max_depth=3, random_state=0)
    estimator.fit(self.x, self.y)

    predicted_time = estimator.predict(self.x)
    observed_time = self.y["lenfol"]
    is_event = self.y["fstat"]

    # error over all samples
    mse_all = mean_squared_error(observed_time, predicted_time)
    self.assertAlmostEqual(numpy.sqrt(mse_all), 590.5441693629117)

    # error restricted to samples selected by the ``fstat`` indicator
    mse_uncensored = mean_squared_error(
        observed_time[is_event], predicted_time[is_event])
    self.assertAlmostEqual(numpy.sqrt(mse_uncensored), 392.97741487479743)
def test_fit_dropout(self):
    """Fit with ``dropout_rate=0.03`` and verify attributes and c-index."""
    model = GradientBoostingSurvivalAnalysis(
        n_estimators=100, max_features=8, learning_rate=1.0,
        dropout_rate=0.03, random_state=0)
    model.fit(self.x, self.y)

    # no subsampling was requested, hence no out-of-bag attribute is expected
    self.assertFalse(hasattr(model, "oob_improvement_"))
    # ``assertEquals`` is a deprecated alias that was removed in Python 3.12
    self.assertEqual(model.max_features_, 8)

    p = model.predict(self.x)

    expected_cindex = numpy.array([0.9094333, 68343, 6806, 0, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))
def test_fit_dropout(make_whas500):
    """Fit with ``dropout_rate=0.03`` and verify attributes and c-index."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        n_estimators=100, max_features=8, learning_rate=1.0,
        dropout_rate=0.03, random_state=0)
    estimator.fit(data.x, data.y)

    # no subsampling was requested, hence no out-of-bag attribute is expected
    assert not hasattr(estimator, "oob_improvement_")
    assert estimator.max_features_ == 8

    predicted = estimator.predict(data.x)
    expected_cindex = (0.9094333, 68343, 6806, 0, 119)
    assert_cindex_almost_equal(
        data.y['fstat'], data.y['lenfol'], predicted, expected_cindex)
def test_squared_loss(make_whas500):
    """Fit with ``loss="squared"`` and verify RMSE on all and on event samples."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        loss="squared", n_estimators=100, max_depth=3, random_state=0)
    estimator.fit(data.x, data.y)

    predicted_time = estimator.predict(data.x)
    observed_time = data.y["lenfol"]
    is_event = data.y["fstat"]

    # error over all samples
    mse_all = mean_squared_error(observed_time, predicted_time)
    rmse_all = numpy.sqrt(mse_all)
    assert round(abs(rmse_all - 580.23345259002951), 7) == 0

    # error restricted to samples selected by the ``fstat`` indicator
    mse_uncensored = mean_squared_error(
        observed_time[is_event], predicted_time[is_event])
    rmse_uncensored = numpy.sqrt(mse_uncensored)
    assert round(abs(rmse_uncensored - 383.10639243317951), 7) == 0
def test_ipcwls_loss(make_whas500):
    """Fit with ``loss="ipcwls"`` and verify RMSE on all and on event samples."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        loss="ipcwls", n_estimators=100, max_depth=3, random_state=0)
    estimator.fit(data.x, data.y)

    predicted_time = estimator.predict(data.x)
    observed_time = data.y["lenfol"]
    is_event = data.y["fstat"]

    # error over all samples
    mse_all = mean_squared_error(observed_time, predicted_time)
    rmse_all = numpy.sqrt(mse_all)
    assert round(abs(rmse_all - 590.5441693629117), 7) == 0

    # error restricted to samples selected by the ``fstat`` indicator
    mse_uncensored = mean_squared_error(
        observed_time[is_event], predicted_time[is_event])
    rmse_uncensored = numpy.sqrt(mse_uncensored)
    assert round(abs(rmse_uncensored - 392.97741487479743), 7) == 0
def test_fit_int_param_as_float(self):
    """Integer hyper-parameters passed as floats must be reported as integers."""
    estimator = GradientBoostingSurvivalAnalysis(
        n_estimators=100.0, max_depth=3.0, min_samples_split=10.0,
        random_state=0)

    reported = estimator.get_params()
    self.assertEqual(100, reported["n_estimators"])
    self.assertEqual(3, reported["max_depth"])
    self.assertEqual(10, reported["min_samples_split"])

    # the same must hold for parameters changed after construction
    estimator.set_params(max_leaf_nodes=15.0)
    self.assertEqual(15, estimator.get_params()["max_leaf_nodes"])

    estimator.fit(self.x, self.y)
    predicted = estimator.predict(self.x)

    cindex_result = concordance_index_censored(
        self.y['fstat'], self.y['lenfol'], predicted)
    cindex_expected = numpy.array([0.90256690042449006, 67826, 7321, 2, 119])
    assert_array_almost_equal(cindex_expected, numpy.array(cindex_result))
def test_fit_int_param_as_float(make_whas500):
    """Integer hyper-parameters passed as floats must be reported as integers."""
    data = make_whas500(with_std=False, to_numeric=True)

    estimator = GradientBoostingSurvivalAnalysis(
        n_estimators=100.0, max_depth=3.0, min_samples_split=10.0,
        random_state=0)

    reported = estimator.get_params()
    assert 100 == reported["n_estimators"]
    assert 3 == reported["max_depth"]
    assert 10 == reported["min_samples_split"]

    # the same must hold for parameters changed after construction
    estimator.set_params(max_leaf_nodes=15.0)
    assert 15 == estimator.get_params()["max_leaf_nodes"]

    estimator.fit(data.x, data.y)
    predicted = estimator.predict(data.x)

    expected_cindex = (0.90256690042449006, 67826, 7321, 2, 119)
    assert_cindex_almost_equal(
        data.y['fstat'], data.y['lenfol'], predicted, expected_cindex)
def test_fit_subsample(self):
    """Fit with ``subsample=0.6`` and verify attributes, c-index and input validation."""
    model = GradientBoostingSurvivalAnalysis(
        n_estimators=100, max_features=8, subsample=0.6, random_state=0)
    model.fit(self.x, self.y)

    # ``assertEquals`` is a deprecated alias that was removed in Python 3.12
    self.assertEqual(model.max_features_, 8)
    self.assertTrue(hasattr(model, "oob_improvement_"))

    p = model.predict(self.x)

    expected_cindex = numpy.array([0.8610760, 64709, 10440, 0, 119])
    result = concordance_index_censored(self.y['fstat'], self.y['lenfol'], p)
    assert_array_almost_equal(expected_cindex, numpy.array(result))

    self.assertTupleEqual((100,), model.train_score_.shape)
    self.assertTupleEqual((100,), model.oob_improvement_.shape)

    # predicting with fewer columns than used for fitting must be rejected
    self.assertRaisesRegex(
        ValueError,
        "Number of features of the model must match the input. "
        "Model n_features is 14 and input n_features is 2 ",
        model.predict, self.x[:, :2])
def train_gbmsurv(population = None, plpData= None, train = True, modelOutput =None, loss='coxph', learning_rate=0.1, n_estimators=100, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_split=None, min_impurity_decrease=0.0, max_features=None, max_leaf_nodes=None, subsample=1.0, dropout_rate=0.0, verbose=0, seed = 1, quiet = True):
    """Cross-validate or train a final GradientBoostingSurvivalAnalysis model.

    Parameters
    ----------
    population : 2-D array
        Column 0 is used to select rows of ``plpData``, column 1 is the
        outcome indicator (values > 0 are treated as an outcome), column 2
        is stored as the ``surv`` field of the target, and the last column
        is the fold index. Rows with a fold index > 0 form the train set.
    plpData : 2-D matrix
        Covariates. NOTE(review): ``.toarray()`` is called on row subsets
        before predicting, so a scipy sparse matrix appears to be
        expected - confirm against the caller.
    train : bool
        If true, run cross-validation over the fold indices ``1..max`` and
        return out-of-fold predictions. Otherwise fit on the whole train
        set, save the model and return in-sample predictions.
    modelOutput : str
        Directory in which ``model.pkl`` is written. Required when
        ``train`` is false; not used otherwise.
    seed : int
        Passed to the estimator as ``random_state``.
    quiet : bool
        Accepted for interface compatibility; not used by this function.

    All remaining keyword arguments are forwarded unchanged to
    ``GradientBoostingSurvivalAnalysis``.

    Returns
    -------
    If ``train`` is true: the train-set rows of ``population`` with the
    out-of-fold prediction appended as an additional last column.
    Otherwise: a tuple of (train-set rows of ``population`` with the
    prediction appended as last column, ``feature_importances_`` of the
    fitted model).

    Raises
    ------
    ValueError
        If ``train`` is false and ``modelOutput`` is None.
    """
    # Fail before the (potentially long) training run instead of after it.
    if not train and modelOutput is None:
        raise ValueError("modelOutput must be a directory path when train is False")

    print("Training python scikit-survial GradientBoostingSurvivalAnalysis model" )

    # Structured target: boolean outcome indicator plus integer time value.
    ytype = np.dtype([('outcome', '?'), ('surv', 'i')])
    y = np.empty(population.shape[0], dtype=ytype)
    y['outcome'] = population[:, 1] > 0
    y['surv'] = population[:, 2]

    X = plpData[population[:, 0], :]

    fold_ids = population[:, -1]
    trainInds = fold_ids > 0
    print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1]))
    print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1]))

    # Restrict everything to the train set once; the fold masks below are
    # relative to these train-set rows.
    train_population = population[trainInds, :]
    train_fold_ids = fold_ids[trainInds]
    X_train = X[trainInds, :]
    y_train = y[trainInds]

    # Single definition of the estimator configuration shared by the
    # cross-validation and the final-model branch.
    model_params = dict(
        loss=loss,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_depth=max_depth,
        min_impurity_split=min_impurity_split,
        min_impurity_decrease=min_impurity_decrease,
        random_state=seed,
        max_features=max_features,
        max_leaf_nodes=max_leaf_nodes,
        subsample=subsample,
        dropout_rate=dropout_rate,
        verbose=verbose,
    )
    ###########################################################################
    if train:
        pred_size = int(np.sum(trainInds))
        print("Calculating prediction for train set of size %s" %(pred_size))
        test_pred = np.zeros(pred_size)
        for i in range(1, int(np.max(fold_ids) + 1), 1):
            testInd = train_fold_ids == i
            trainInd = train_fold_ids != i
            train_x = X_train[trainInd, :]
            train_y = y_train[trainInd]
            test_x = X_train[testInd, :]
            print("Fold %s split %s in train set and %s in test set" %(i, train_x.shape[0], test_x.shape[0]))
            print("Train set contains %s outcomes " %(np.sum(train_y['outcome'])))
            print("Training fold %s" %(i))
            start_time = timeit.default_timer()
            gbmsurv = GradientBoostingSurvivalAnalysis(**model_params)
            gbmsurv = gbmsurv.fit(X=train_x, y=train_y)
            end_time = timeit.default_timer()
            print("Training fold took: %.2f s" %(end_time-start_time))
            print("Calculating predictions on left out fold set...")
            # One prediction per left-out row, written back at the
            # positions of this fold within the train set.
            test_pred[testInd] = gbmsurv.predict(test_x.toarray()).flatten()
            print("Prediction complete: %s rows " %(np.shape(test_pred[testInd])[0]))
            print("Mean: %s prediction value" %(np.mean(test_pred[testInd])))
        # merge predictions with the train-set rows as an extra column
        prediction = np.append(train_population, test_pred.reshape(pred_size, 1), axis=1)
        return prediction

    # train final:
    print("Training final python scikit-survial GradientBoostingSurvivalAnalysis model on all train data...")
    print("X- %s rows and Y %s length" %(X_train.shape[0], y_train.shape[0]))
    start_time = timeit.default_timer()
    gbmsurv = GradientBoostingSurvivalAnalysis(**model_params)
    gbmsurv = gbmsurv.fit(X_train, y_train)
    end_time = timeit.default_timer()
    print("Training final took: %.2f s" %(end_time-start_time))

    # save the model; exist_ok avoids the check-then-create race
    os.makedirs(modelOutput, exist_ok=True)
    joblib.dump(gbmsurv, os.path.join(modelOutput,"model.pkl"))
    print("Model saved to: %s" %(modelOutput) )

    pred = gbmsurv.predict(X_train.toarray()).flatten()
    prediction = np.append(train_population, pred.reshape(train_population.shape[0], 1), axis=1)
    return prediction, gbmsurv.feature_importances_