ngb = NGBoost(Dist=eval(args.distn),
              n_estimators=args.n_est,
              learning_rate=args.lr,
              natural_gradient=args.natural,
              verbose=args.verbose,
              minibatch_frac=1.0,
              Base=base_name_to_learner[args.base],
              Score=eval(args.score)())
train_losses = ngb.fit(X_train, Y_train)  # , X_val, Y_val)
forecast = ngb.pred_dist(X_test)
train_forecast = ngb.pred_dist(X_train)
print('NGB score: %.4f (val), %.4f (train)' % (
    concordance_index_censored(Y_test['Event'], Y_test['Time'],
                               -forecast.mean())[0],
    concordance_index_censored(Y_train['Event'], Y_train['Time'],
                               -train_forecast.mean())[0]))
# logger.tick(forecast, Y_test)

##
## sksurv
##
gbsa = GBSA(n_estimators=args.n_est,
            learning_rate=args.lr,
            subsample=args.minibatch_frac,
            verbose=args.verbose)
gbsa.fit(X_train, Y_train)
print('GBSA score: %.4f (val), %.4f (train)' % (
    gbsa.score(X_test, Y_test),
    gbsa.score(X_train, Y_train)))
# logger.save()
def test_max_features(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, max_features="auto",
                                             max_depth=3, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    assert model.max_features_ == whas500_data.x.shape[1]

    model.set_params(max_features="sqrt")
    model.fit(whas500_data.x, whas500_data.y)
    assert round(abs(model.max_features_ - int(numpy.sqrt(whas500_data.x.shape[1]))), 7) == 0

    model.set_params(max_features="log2")
    model.fit(whas500_data.x, whas500_data.y)
    assert round(abs(model.max_features_ - int(numpy.log2(whas500_data.x.shape[1]))), 7) == 0

    model.set_params(max_features=0.25)
    model.fit(whas500_data.x, whas500_data.y)
    assert round(abs(model.max_features_ - int(0.25 * whas500_data.x.shape[1])), 7) == 0

    model.set_params(max_features=5)
    model.fit(whas500_data.x, whas500_data.y)
    assert round(abs(model.max_features_ - 5), 7) == 0

    model.set_params(max_features=-1)
    with pytest.raises(ValueError,
                       match=r"max_features must be in \(0, n_features\]"):
        model.fit(whas500_data.x, whas500_data.y)

    model.set_params(max_features=-1.125)
    with pytest.raises(ValueError,
                       match=r"max_features must be in \(0, 1.0\]"):
        model.fit(whas500_data.x, whas500_data.y)

    model.set_params(max_features="fail_me")
    with pytest.raises(ValueError,
                       match="Invalid value for max_features: 'fail_me'. "
                             "Allowed string values are 'auto', 'sqrt' "
                             "or 'log2'"):
        model.fit(whas500_data.x, whas500_data.y)
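These tests rely on a make_whas500 fixture that is not shown in any of the snippets here. A minimal sketch of what such a fixture could look like, assuming it wraps scikit-survival's WHAS500 loader and column helpers (the project's actual fixture may differ):

from collections import namedtuple

import pytest
from sksurv.column import encode_categorical, standardize
from sksurv.datasets import load_whas500

# Hypothetical container matching the whas500_data.x / whas500_data.y access pattern.
DataSet = namedtuple("DataSet", ["x", "y"])


@pytest.fixture()
def make_whas500():
    # Hypothetical fixture: loads WHAS500 and optionally standardizes
    # or one-hot encodes the features before returning arrays.
    def _make_whas500(with_std=True, to_numeric=False):
        x, y = load_whas500()
        if with_std:
            x = standardize(x)
        if to_numeric:
            x = encode_categorical(x)
        return DataSet(x=x.values, y=y)
    return _make_whas500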
E = df['LapseIndicator'] == 1  # boolean event indicator
df2['E'] = E
df2['T'] = T

X, y = get_x_y(df2, ['E', 'T'], pos_label=True)
for c in X.columns.values:
    if c != 'AGE AT DOC':
        X[c] = X[c].astype('category')

data_x_numeric = OneHotEncoder().fit_transform(X)

#%%
estimator = GradientBoostingSurvivalAnalysis(verbose=True, n_estimators=500)
estimator.fit(data_x_numeric, y)
print(estimator.score(data_x_numeric, y))
print()

scores = fit_and_score_features(data_x_numeric.values, y)
print(pd.Series(scores, index=data_x_numeric.columns).sort_values(ascending=False))

pickle.dump(estimator, open('GradientRegressor.pkl', 'wb'))

#%%
from sklearn.feature_selection import SelectKBest
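fit_and_score_features is called above but never defined in this snippet. A plausible sketch, following the per-feature Cox model pattern from the scikit-survival user guide (the original helper may differ):

import numpy as np
from sksurv.linear_model import CoxPHSurvivalAnalysis


def fit_and_score_features(X, y):
    # Hypothetical helper: fit a univariate Cox model on each feature and
    # return its concordance index as a per-feature score.
    n_features = X.shape[1]
    scores = np.empty(n_features)
    m = CoxPHSurvivalAnalysis()
    for j in range(n_features):
        Xj = X[:, j:j + 1]
        m.fit(Xj, y)
        scores[j] = m.score(Xj, y)
    return scores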
def test_squared_loss_staged_predict(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    # Test whether staged decision function eventually gives
    # the same prediction.
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100,
                                             max_depth=3, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)

    y_pred = model.predict(whas500_data.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(whas500_data.x):
        assert y.shape == y_pred.shape

    assert_array_equal(y_pred, y)

    model.set_params(dropout_rate=0.03)
    model.fit(whas500_data.x, whas500_data.y)

    y_pred = model.predict(whas500_data.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(whas500_data.x):
        assert y.shape == y_pred.shape

    assert_array_equal(y_pred, y)
def test_squared_loss_staged_predict(self):
    # Test whether staged decision function eventually gives
    # the same prediction.
    model = GradientBoostingSurvivalAnalysis(loss="squared", n_estimators=100,
                                             max_depth=3, random_state=0)
    model.fit(self.x, self.y)

    y_pred = model.predict(self.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(self.x):
        self.assertTupleEqual(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)

    model.set_params(dropout_rate=0.03)
    model.fit(self.x, self.y)

    y_pred = model.predict(self.x)

    # test if prediction for last stage equals ``predict``
    for y in model.staged_predict(self.x):
        self.assertTupleEqual(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)
def test_fit_verbose(make_whas500):
    whas500_data = make_whas500(with_std=False, to_numeric=True)
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, verbose=1, random_state=0)
    model.fit(whas500_data.x, whas500_data.y)
def test_fit_verbose(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, verbose=1, random_state=0)
    model.fit(self.x, self.y)
def test_max_features(self):
    model = GradientBoostingSurvivalAnalysis(n_estimators=10, max_features="auto",
                                             max_depth=3, random_state=0)
    model.fit(self.x, self.y)

    self.assertEqual(model.max_features_, self.x.shape[1])

    model.set_params(max_features="sqrt")
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, int(numpy.sqrt(self.x.shape[1])))

    model.set_params(max_features="log2")
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, int(numpy.log2(self.x.shape[1])))

    model.set_params(max_features=0.25)
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, int(0.25 * self.x.shape[1]))

    model.set_params(max_features=5)
    model.fit(self.x, self.y)
    self.assertAlmostEqual(model.max_features_, 5)

    model.set_params(max_features=-1)
    self.assertRaisesRegex(ValueError,
                           r"max_features must be in \(0, n_features\]",
                           model.fit, self.x, self.y)

    model.set_params(max_features=-1.125)
    self.assertRaisesRegex(ValueError,
                           r"max_features must be in \(0, 1.0\]",
                           model.fit, self.x, self.y)

    model.set_params(max_features="fail_me")
    self.assertRaisesRegex(ValueError,
                           "Invalid value for max_features: 'fail_me'. "
                           "Allowed string values are 'auto', 'sqrt' "
                           "or 'log2'",
                           model.fit, self.x, self.y)
ngb = NGBoost(Dist=eval(args.distn),
              n_estimators=args.n_est,
              learning_rate=args.lr,
              natural_gradient=args.natural,
              verbose=args.verbose,
              minibatch_frac=1.0,
              Base=base_name_to_learner[args.base],
              Score=eval(args.score))
train_losses = ngb.fit(X_train, Y_train)  # , X_val, Y_val)
forecast = ngb.pred_dist(X_test)
train_forecast = ngb.pred_dist(X_train)
print('NGB score: %.4f (val), %.4f (train)' % (
    concordance_index_censored(Y_test['Event'], Y_test['Time'],
                               -forecast.mean())[0],
    concordance_index_censored(Y_train['Event'], Y_train['Time'],
                               -train_forecast.mean())[0]))
# logger.tick(forecast, Y_test)

##
## sksurv
##
gbsa = GBSA(n_estimators=args.n_est,
            learning_rate=args.lr,
            subsample=args.minibatch_frac,
            verbose=args.verbose)
gbsa.fit(X_train, Y_train)
print('GBSA score: %.4f (val), %.4f (train)' % (
    gbsa.score(X_test, Y_test),
    gbsa.score(X_train, Y_train)))
# logger.save()
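base_name_to_learner is referenced by these scripts but not defined in the snippets. In NGBoost experiment scripts it maps a command-line name to an sklearn base regressor; a hypothetical sketch of such a mapping:

from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor

# Hypothetical mapping; the actual script may use different base learners
# or hyper-parameters.
base_name_to_learner = {
    'tree': DecisionTreeRegressor(criterion='friedman_mse', max_depth=3),
    'linear': Ridge(alpha=0.0),
}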
ngb = NGBoost(
    Dist=eval(args.distn),
    n_estimators=args.n_est,
    learning_rate=args.lr,
    natural_gradient=args.natural,
    verbose=args.verbose,
    minibatch_frac=1.0,
    Base=base_name_to_learner[args.base],
    Score=eval(args.score),
)
train_losses = ngb.fit(X_train, Y_train, E_train)
forecast = ngb.pred_dist(X_test)
train_forecast = ngb.pred_dist(X_train)
print("NGB score: %.4f (val), %.4f (train)" % (
    concordance_index_censored(E_test.astype(bool), Y_test, -forecast.mean())[0],
    concordance_index_censored(E_train.astype(bool), Y_train, -train_forecast.mean())[0],
))

##
## sksurv
##
gbsa = GBSA(
    n_estimators=args.n_est,
    learning_rate=args.lr,
    subsample=args.minibatch_frac,
    verbose=args.verbose,
)
gbsa.fit(X_train, Y_join(Y_train, E_train))
print("GBSA score: %.4f (val), %.4f (train)" % (
    gbsa.score(X_test, Y_join(Y_test, E_test)),
    gbsa.score(X_train, Y_join(Y_train, E_train)),
))
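Y_join is used above but not defined in this snippet; it evidently packs event times and event indicators into the structured array scikit-survival expects. A minimal sketch using sksurv.util.Surv (the actual helper may differ):

from sksurv.util import Surv


def Y_join(time, event):
    # Hypothetical helper: build the ('event', 'time') structured array
    # that scikit-survival estimators expect from separate arrays.
    return Surv.from_arrays(event=event.astype(bool), time=time)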
def RandomGridSearchRFC_Fixed(X, Y, splits, model, survival):
    """
    Searches for the best set of hyper-parameters for the given model
    via grid search with cross-validation.

    Input:
        X: training set
        Y: labels of the training set
        splits: cross-validation splits, used to make sure the parameters are stable
        model: one of 'svm', 'cart', 'rf', 'xgboost', 'lr', 'cox', 'survSVM', 'gb'
        survival: if True, Y is structured survival data and the concordance
            index is used for scoring
    Output:
        clf.best_params_: dictionary with the best parameters, to use: param_svm['kernel']
        clf: the fitted grid-search estimator
    """
    start_svm = time.time()

    if model == 'svm':
        clf = svm.SVC()
        tuned_parameters = {
            'C': [0.01, 1, 10],
            'kernel': ['rbf', 'linear'],
            # 'kernel': ['linear', 'rbf', 'sigmoid'],
            # 'degree': [1, 3, 5, 10],
            # 'decision_function_shape': ['ovo', 'ovr'],
            # 'cache_size': [500, 1000, 1500, 2000],
            'shrinking': [False, True],
            # 'probability': [False, True]
        }

    if model == 'cart':
        clf = tree.DecisionTreeClassifier()
        tuned_parameters = {
            'criterion': ['gini', 'entropy'],
            'max_depth': [10, 20],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }

    if model == 'rf':
        clf = ensemble.RandomForestClassifier()
        tuned_parameters = {
            'n_estimators': [200, 500, 1000],
            # 'max_features': ['auto', 'sqrt', 'log2', 1, 4, 8],
            'max_depth': [10, 20],
            # 'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 3, 5],
            'min_samples_leaf': [2, 3, 5],
        }

    if model == 'xgboost':
        clf = XGBClassifier()
        tuned_parameters = {
            'booster': ['gbtree'],
            'max_depth': [5, 10, 20],
            'reg_lambda': [0, 1],
            'reg_alpha': [0, 1],
            'subsample': [0.5, 1],
        }

    if model == 'lr':
        clf = linear_model.LogisticRegression()
        tuned_parameters = {
            'solver': ['liblinear', 'sag', 'saga'],
        }

    if model == 'cox':
        clf = CoxnetSurvivalAnalysis()
        tuned_parameters = {
            'n_alphas': [50, 100, 200],
            'l1_ratio': [0.1, 0.5, 1],
        }

    if model == 'survSVM':
        clf = FastSurvivalSVM()
        tuned_parameters = {
            'alpha': [0.5, 1],
            'rank_ratio': [0.5, 1],
            'max_iter': [20, 40, 80],
            'optimizer': ['rbtree', 'avltree'],
        }

    if model == 'gb':
        clf = GradientBoostingSurvivalAnalysis()
        tuned_parameters = {
            'learning_rate': [0.1, 0.3],
            'n_estimators': [100, 200, 400],
            'max_depth': [3, 6, 12],
        }

    scores = ['roc_auc']
    if survival:
        # structured survival labels cannot be stratified or scored directly,
        # so stratify on the event indicator and score with the concordance index
        scoring = make_scorer(CI, greater_is_better=True)
        y_for_cv = np.array([t[0] for t in Y])
        cv = list(StratifiedKFold(n_splits=2).split(X, y_for_cv))  # x-validation
    else:
        scoring = scores[0]
        cv = list(StratifiedKFold(n_splits=2).split(X, Y))  # x-validation

    print(' ...performing x-validation')
    clf = GridSearchCV(clf, tuned_parameters, scoring=scoring, cv=cv, verbose=10)
    # clf = BayesSearchCV(clf, tuned_parameters, n_iter=50, cv=splits,
    #                     optimizer_kwargs=dict(acq_func='LCB', base_estimator='RF'))
    clf.fit(X, Y)

    end_svm = time.time()
    print("Total time to process: ", end_svm - start_svm)

    return clf.best_params_, clf
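The CI score function used by the survival branch above is assumed but not shown. A hedged sketch of a concordance-index scorer compatible with make_scorer, plus a hypothetical invocation (X_train and y_train are placeholders for data prepared elsewhere, with y_train as a sksurv structured array):

from sksurv.metrics import concordance_index_censored


def CI(y_true, y_pred):
    # Hypothetical scorer: assumes sksurv's structured-array convention,
    # where the first field is the boolean event indicator and the second
    # is the observed time; y_pred holds predicted risk scores.
    event_field, time_field = y_true.dtype.names
    return concordance_index_censored(y_true[event_field], y_true[time_field],
                                      y_pred)[0]


# Hypothetical usage: tune the gradient-boosted survival model.
best_params, search = RandomGridSearchRFC_Fixed(X_train, y_train, splits=2,
                                                model='gb', survival=True)
print(best_params)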