def test_early_stopping_classification(
    data, scoring, validation_fraction, early_stopping, n_iter_no_change, tol
):
    max_iter = 50

    X, y = data

    gb = HistGradientBoostingClassifier(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0,
    )
    gb.fit(X, y)

    if early_stopping is True:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926
    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng)
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.actual_n_bins_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.actual_n_bins_ !=
                  mapper_whole_data.actual_n_bins_)
def hist_gradient_boosting_classifier(x_train, y_train, x_test, y_test):
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier

    ensem = HistGradientBoostingClassifier()
    ensem.fit(x_train, y_train)
    value = ensem.score(x_test, y_test)
    return "{0:.2f}".format(value)
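# A minimal usage sketch for the helper above (illustrative only): the breast
# cancer dataset and the train/test split below are assumptions, not part of
# the original snippet.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(hist_gradient_boosting_classifier(x_train, y_train, x_test, y_test))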
def test_invalid_classification_loss():
    binary_clf = HistGradientBoostingClassifier(loss="binary_crossentropy")
    err_msg = ("loss='binary_crossentropy' is not defined for multiclass "
               "classification with n_classes=3, use "
               "loss='categorical_crossentropy' instead")
    with pytest.raises(ValueError, match=err_msg):
        binary_clf.fit(np.zeros(shape=(3, 2)), np.arange(3))
def test_zero_sample_weights_classification():
    # Make sure setting a SW to zero amounts to ignoring the corresponding
    # sample
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    gb = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])

    X = [[1, 0], [1, 0], [1, 0], [0, 1], [1, 1]]
    y = [0, 0, 1, 0, 2]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1, 1]
    gb = HistGradientBoostingClassifier(loss='categorical_crossentropy',
                                        min_samples_leaf=1)
    gb.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(gb.predict([[1, 0]]), [1])
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                          max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=2,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss="binary_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def test_string_target_early_stopping(scoring):
    # Regression test for #14709 where the targets need to be encoded before
    # computing the score
    rng = np.random.RandomState(42)
    X = rng.randn(100, 10)
    y = np.array(["x"] * 50 + ["y"] * 50, dtype=object)
    gbrt = HistGradientBoostingClassifier(n_iter_no_change=10, scoring=scoring)
    gbrt.fit(X, y)
def histGradientModel(X_train, Y_train):
    # use Hist Gradient Boosting Classifier
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier

    histgrad = HistGradientBoostingClassifier()
    histgrad.fit(X_train, Y_train)
    print('\nHist Gradient Boosting Training Score:',
          histgrad.score(X_train, Y_train))
    return histgrad, histgrad.score(X_train, Y_train)
def common_test_model_hgb_classifier(self, add_nan=False, n_classes=2):
    model = HistGradientBoostingClassifier(max_iter=5, max_depth=2)
    X, y = make_classification(n_features=10, n_samples=1000,
                               n_informative=4, n_classes=n_classes,
                               random_state=42)
    if add_nan:
        rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
        cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
        X[rows, cols] = numpy.nan

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.5,
                                                   random_state=42)
    model.fit(X_train, y_train)

    model_onnx = convert_sklearn(
        model, "unused",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(model_onnx)
    X_test = X_test.astype(numpy.float32)[:5]
    dump_data_and_model(
        X_test, model, model_onnx,
        basename="SklearnHGBClassifier%s%d" % (
            "nan" if add_nan else '', n_classes),
        verbose=False,
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')")

    if n_classes == 2:
        model_onnx = convert_sklearn(
            model, "unused",
            [("input", FloatTensorType([None, X.shape[1]]))],
            options={model.__class__: {'raw_scores': True}})
        self.assertIsNotNone(model_onnx)
        X_test = X_test.astype(numpy.float32)[:5]

        # There is a bug in onnxruntime <= 1.1.0.
        # Raw scores are always positive.
        dump_data_and_model(
            X_test, model, model_onnx,
            basename="SklearnHGBClassifierRaw%s%d" % (
                "nan" if add_nan else '', n_classes),
            verbose=False,
            allow_failure="StrictVersion(onnx.__version__)"
                          " < StrictVersion('1.2') or "
                          "StrictVersion(onnxruntime.__version__)"
                          " < StrictVersion('1.2.0')",
            methods=['predict', 'decision_function_binary'])
def test_crossentropy_binary_problem():
    # categorical_crossentropy should only be used if there are more than two
    # classes present. PR #14869
    X = [[1], [0]]
    y = [0, 1]
    gbrt = HistGradientBoostingClassifier(loss='categorical_crossentropy')
    with pytest.raises(ValueError,
                       match="'categorical_crossentropy' is not suitable for"):
        gbrt.fit(X, y)
def model_(X, y, rs):
    osp = RandomUnderSampler(random_state=rs)
    x_train_, y_train_ = osp.fit_sample(X, y)
    # # Baseline model
    # clf = CatBoostClassifier(loss_function='Logloss',
    #                          logging_level='Silent',
    #                          cat_features=categorical_features_indices)
    clf = HistGradientBoostingClassifier(random_state=10)
    clf.fit(x_train_, y_train_)
    return clf
def k_fold_trainning(rawdata, n_folds=5):
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True)
    target = np.array(rawdata[0].values)
    lure = np.array(rawdata[1].values)
    y = np.array(rawdata['label'].values)
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(target, lure, y)):
        print('----------------Training Fold %d---------------' % (i + 1))
        X_train = pd.DataFrame({0: target[train], 1: lure[train]})
        X_test = pd.DataFrame({0: target[test], 1: lure[test]})
        pmfm = create_pmfm(X_train, y[train])
        train_feature = X_train.apply(feature_Encoding, axis=1,
                                      args=(0, 1, pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis=1,
                                    args=(0, 1, pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500,
                                             learning_rate=0.9,
                                             max_depth=6,
                                             l2_regularization=100)
        train_data = np.matrix([train_feature[i]
                                for i in range(train_feature.shape[0])])
        test_data = np.matrix([test_feature[i]
                               for i in range(test_feature.shape[0])])
        clf.fit(train_data, y[train])
        pred = clf.predict(test_data)
        evaluate(y[test], pred)
        viz = plot_roc_curve(clf, test_data, y[test],
                             name='ROC fold {}'.format(i + 1),
                             alpha=0.5, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.3f $\pm$ %0.3f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic Curve")
    ax.legend(loc="lower right")
    plt.savefig('roc.png', dpi=300)
def train_model(rawdata, compute_importance=True):
    X, y = create_Input(rawdata)
    pmfm = create_pmfm(X, y)
    np.save("feature.npy", pmfm)
    feature = X.apply(feature_Encoding, axis=1, args=(0, 1, pmfm)).values
    clf = HistGradientBoostingClassifier(max_iter=500,
                                         learning_rate=0.9,
                                         max_depth=6,
                                         l2_regularization=100)
    data = np.matrix([feature[i] for i in range(feature.shape[0])])
    clf.fit(data, y)
    joblib.dump(clf, "train_model.pkl")
    if compute_importance:
        feature_importance(clf, data, y)
def test_zero_division_hessians(data):
    # non regression test for issue #14018
    # make sure we avoid zero division errors when computing the leaves values.
    # If the learning rate is too high, the raw predictions are bad and will
    # saturate the softmax (or sigmoid in binary classif). This leads to
    # probabilities being exactly 0 or 1, gradients being constant, and
    # hessians being zero.
    X, y = data
    gb = HistGradientBoostingClassifier(learning_rate=100, max_iter=10)
    gb.fit(X, y)
def test_early_stopping_on_test_set_with_warm_start():
    # Non regression test for #16661 where second fit fails with
    # warm_start=True, early_stopping is on, and no validation set
    X, y = make_classification(random_state=0)
    gb = HistGradientBoostingClassifier(
        max_iter=1, scoring='loss', warm_start=True, early_stopping=True,
        n_iter_no_change=1, validation_fraction=None)

    gb.fit(X, y)
    # does not raise on second call
    gb.set_params(max_iter=2)
    gb.fit(X, y)
def test_same_predictions_classification(seed, min_samples_leaf, n_samples,
                                         max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification

    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples
    max_iter = 1
    max_bins = 256

    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    est_sklearn = HistGradientBoostingClassifier(
        loss='binary_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=1,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated as numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def common_test_model_hgb_classifier(self, add_nan=False, n_classes=2):
    model = HistGradientBoostingClassifier(max_iter=5, max_depth=2)
    X, y = make_classification(n_features=10, n_samples=1000,
                               n_informative=4, n_classes=n_classes,
                               random_state=42)
    if add_nan:
        rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
        cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
        X[rows, cols] = numpy.nan

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.5,
                                                   random_state=42)
    model.fit(X_train, y_train)

    if n_classes == 2:
        model_onnx = convert_sklearn(
            model, "unused",
            [("input", FloatTensorType([None, X.shape[1]]))],
            options={model.__class__: {'raw_scores': True}})
        self.assertIsNotNone(model_onnx)
        X_test = X_test.astype(numpy.float32)[:5]
        dump_data_and_model(
            X_test, model, model_onnx,
            basename="SklearnHGBClassifierRaw%s%d" % (
                "nan" if add_nan else '', n_classes),
            verbose=False,
            intermediate_steps=True,
            methods=['predict', 'decision_function_binary'],
            backend=['python'])

    model_onnx = convert_sklearn(
        model, "unused",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(model_onnx)
    X_test = X_test.astype(numpy.float32)[:5]
    dump_data_and_model(
        X_test, model, model_onnx,
        basename="SklearnHGBClassifier%s%d" % (
            "nan" if add_nan else '', n_classes),
        verbose=False)
def test_missing_values_trivial():
    # sanity check for missing values support. With only one feature and
    # y == isnan(X), the gbdt is supposed to reach perfect accuracy on the
    # training set.

    n_samples = 100
    n_features = 1
    rng = np.random.RandomState(0)

    X = rng.normal(size=(n_samples, n_features))
    mask = rng.binomial(1, 0.5, size=X.shape).astype(bool)
    X[mask] = np.nan
    y = mask.ravel()
    gb = HistGradientBoostingClassifier()
    gb.fit(X, y)

    assert gb.score(X, y) == pytest.approx(1)
def k_fold_cross_val(Xs, y_var, k=10):
    clf = tree.DecisionTreeClassifier()
    clf_forest = RandomForestClassifier(n_estimators=10)
    clf_boost = HistGradientBoostingClassifier()
    num_folds = k
    N = Xs.shape[0]
    test_size = int(N / num_folds)
    test_idxs = np.random.permutation(N)[:num_folds * test_size].reshape(
        num_folds, test_size)
    total_score = np.asarray([0., 0., 0.])
    total_F1_score = np.asarray([0., 0., 0.])
    for i in range(num_folds):
        print("Iteration " + str(i) + ":")
        test_i = Xs.index.isin(test_idxs[i])
        df_train, df_test = Xs[~test_i], Xs[test_i]
        y_train, y_test = y_var[~test_i], y_var[test_i]

        clf = clf.fit(df_train.to_numpy(), y_train.to_numpy().ravel())
        score_b = clf.score(df_test.to_numpy(), y_test.to_numpy().ravel())
        clf_forest = clf_forest.fit(df_train.to_numpy(),
                                    y_train.to_numpy().ravel())
        score_f = clf_forest.score(df_test.to_numpy(),
                                   y_test.to_numpy().ravel())
        clf_boost = clf_boost.fit(df_train.to_numpy(),
                                  y_train.to_numpy().ravel())
        score_h = clf_boost.score(df_test.to_numpy(),
                                  y_test.to_numpy().ravel())

        y_hat = clf.predict(df_test.to_numpy())
        f1_b = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (tree):", f1_b)
        y_hat = clf_forest.predict(df_test.to_numpy())
        f1_f = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (forest):", f1_f)
        y_hat = clf_boost.predict(df_test.to_numpy())
        f1_boost = f1_score(y_test.to_numpy().ravel(), y_hat, average='binary')
        print("F1 score (boost):", f1_boost)

        print("Prediction scores for (tree,forest,boost):",
              score_b, score_f, score_h)
        total_score += np.asarray([score_b, score_f, score_h])
        total_F1_score += np.asarray([f1_b, f1_f, f1_boost])

    print("Avg. accuracy scores for (tree,forest,boost):",
          total_score / num_folds)
    print("Avg. F1 scores for (tree,forest,boost):",
          total_F1_score / num_folds)
    return clf, clf_forest, clf_boost
def test_categorical_encoding_strategies():
    # Check native categorical handling vs different encoding strategies. We
    # make sure that native encoding needs only 1 split to achieve a perfect
    # prediction on a simple dataset. In contrast, OneHotEncoded data needs
    # more depth / splits, and treating categories as ordered (just using
    # OrdinalEncoder) requires even more depth.

    # dataset with one random continuous feature, and one categorical feature
    # with values in [0, 5], e.g. from an OrdinalEncoder.
    # class == 1 iff categorical value in {0, 2, 4}
    rng = np.random.RandomState(0)
    n_samples = 10_000
    f1 = rng.rand(n_samples)
    f2 = rng.randint(6, size=n_samples)
    X = np.c_[f1, f2]
    y = np.zeros(shape=n_samples)
    y[X[:, 1] % 2 == 0] = 1

    # make sure dataset is balanced so that the baseline_prediction doesn't
    # influence predictions too much with max_iter = 1
    assert 0.49 < y.mean() < 0.51

    clf_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=1, categorical_features=[False, True]
    )

    # Using native categorical encoding, we get perfect predictions with just
    # one split
    assert cross_val_score(clf_cat, X, y).mean() == 1

    # quick sanity check for the bitset: 0, 2, 4 = 2**0 + 2**2 + 2**4 = 21
    expected_left_bitset = [21, 0, 0, 0, 0, 0, 0, 0]
    left_bitset = clf_cat.fit(X, y)._predictors[0][0].raw_left_cat_bitsets[0]
    assert_array_equal(left_bitset, expected_left_bitset)

    # Treating categories as ordered, we need more depth / more splits to get
    # the same predictions
    clf_no_cat = HistGradientBoostingClassifier(
        max_iter=1, max_depth=4, categorical_features=None
    )
    assert cross_val_score(clf_no_cat, X, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=5)
    assert cross_val_score(clf_no_cat, X, y).mean() == 1

    # Using OHEd data, we need fewer splits than with pure OEd data, but we
    # still need more splits than with the native categorical splits
    ct = make_column_transformer(
        (OneHotEncoder(sparse=False), [1]), remainder="passthrough"
    )
    X_ohe = ct.fit_transform(X)
    clf_no_cat.set_params(max_depth=2)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() < 0.9

    clf_no_cat.set_params(max_depth=3)
    assert cross_val_score(clf_no_cat, X_ohe, y).mean() == 1
def foo(Xs, Xg, ys, yg):
    # For each subset, make splits
    kf = KFold(n_splits=n_splits, random_state=0)

    # Store ITEs
    ite = []

    # For each split
    for idx1, idx2 in kf.split(Xs):
        # Init models
        ms = HistGradientBoostingRegressor()
        mg = HistGradientBoostingClassifier()

        # Train models
        ms.fit(Xs[idx1], ys[idx1])
        mg.fit(Xg[idx1], yg[idx1])

        # Make estimates on test set
        ite.append(
            AIPW_estimator(ms, mg, Xs[idx2], Xg[idx2], ys[idx2], yg[idx2]))

    # Return mean ite and n_employees
    return np.concatenate(ite).mean(), len(Xs)
def gradient_boost(train_data, test_data):
    train_y = train_data['state']
    train_X = train_data.iloc[:, FEATURES_INDICES]
    test_y = test_data['state']
    test_X = test_data.iloc[:, FEATURES_INDICES]
    # search(train_X, train_y)
    # search_xgboost(train_X, train_y)
    gd = HistGradientBoostingClassifier(loss='auto', max_bins=200,
                                        max_depth=10, max_leaf_nodes=35)
    # gd = XGBClassifier()
    gd.fit(train_X, train_y)
    pred_y = gd.predict(test_X)
    evaluate(gd, test_X, test_y, pred_y)
def main():
    # loading the dataset from sklearn.datasets
    df_cancer = load_breast_cancer()
    print(df_cancer.keys())
    X = df_cancer.data
    y = df_cancer.target
    print("number of classes are: ", np.unique(y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        random_state=0)
    # create an object of HistGradientBoostingClassifier
    hist = HistGradientBoostingClassifier()
    # training the model
    hist.fit(X_train, y_train)
    y_pred = hist.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy of the model is: ", accuracy)
    clr = classification_report(y_test, y_pred)
    print("Classification report is:", clr)
def fit(
    self,
    params,
    X_train,
    y_train,
    Xy_val,
    sample_weight,
    n_estimators=None,
    seed=None,
):
    # Xy_val not used
    if seed is not None:
        params.update({"random_state": seed})
    if n_estimators is not None:
        params.update({"max_iter": n_estimators})
    clf = HistGradientBoostingClassifier(
        **params,
        categorical_features=self.categorical_features,
        early_stopping="auto")
    clf.fit(X_train, y_train, sample_weight=sample_weight)
    return clf, None
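# Hypothetical usage sketch for the fit() wrapper above (not from the original
# source): a tiny host class supplies the `categorical_features` attribute the
# wrapper reads from `self`, and the data and parameters below are made up for
# illustration (scikit-learn >= 0.24 assumed for `categorical_features`).
import numpy as np
from sklearn.datasets import make_classification


class HGBTrainer:
    categorical_features = None  # no categorical columns in this toy example
    fit = fit  # reuse the standalone function above as a method


X, y = make_classification(n_samples=200, n_features=8, random_state=0)
clf, _ = HGBTrainer().fit(
    params={"max_depth": 3},
    X_train=X,
    y_train=y,
    Xy_val=None,  # ignored by the wrapper
    sample_weight=np.ones(len(y)),
    n_estimators=25,
    seed=0,
)
print(clf.score(X, y))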
def test_on_target(rawdata, sitename):
    print('------------Testing on %s-----------' % sitename)
    target_info = pd.read_csv("target_info.csv")
    if sitename in target_info['Site'].values:
        target_dict = target_info.set_index('Site').T.to_dict()
        sequence = target_dict[sitename]['Sequence']
        train_data = rawdata[rawdata[0] != sequence]
        test_data = rawdata[rawdata[0] == sequence]
        X_train, y_train = create_Input(train_data)
        X_test, y_test = create_Input(test_data)
        pmfm = create_pmfm(X_train, y_train)
        train_feature = X_train.apply(feature_Encoding, axis=1,
                                      args=(0, 1, pmfm)).values
        test_feature = X_test.apply(feature_Encoding, axis=1,
                                    args=(0, 1, pmfm)).values
        clf = HistGradientBoostingClassifier(max_iter=500,
                                             learning_rate=0.9,
                                             max_depth=6,
                                             l2_regularization=100)
        train_matrix = np.matrix([train_feature[i]
                                  for i in range(train_feature.shape[0])])
        test_matrix = np.matrix([test_feature[i]
                                 for i in range(test_feature.shape[0])])
        clf.fit(train_matrix, y_train)
        pred = clf.predict(test_matrix)
        evaluate(y_test, pred)
    else:
        print('ERROR: INCORRECT SITE NAME')
class MyHGBClassifierModel(BaseModel):
    def __init__(self, model_params, fit_params: Optional[Dict]):
        self.model_params = model_params
        self.fit_params = fit_params
        if self.fit_params is None:
            self.fit_params = {}

    def build_model(self):
        self.model = HistGradientBoostingClassifier(**self.model_params)
        return self.model

    def fit(self, train_x, train_y, valid_x=None, valid_y=None):
        self.model = self.build_model()
        self.model.fit(
            train_x, train_y,
            **self.fit_params
        )
        return self.model

    def predict(self, est, valid_x):
        preds = est.predict_proba(valid_x)[:, 1]
        return preds
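# Hypothetical usage sketch for MyHGBClassifierModel (illustrative only): the
# synthetic dataset and parameter values are assumptions; BaseModel, Optional,
# Dict and HistGradientBoostingClassifier are assumed to be imported by the
# surrounding module.
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, random_state=0)
model = MyHGBClassifierModel(
    model_params={"max_iter": 50, "learning_rate": 0.1, "random_state": 0},
    fit_params=None,  # falls back to an empty dict in __init__
)
est = model.fit(X, y)
proba_positive = model.predict(est, X)  # probability of the positive class
print(proba_positive[:5])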
def test_early_stopping_classification(data, scoring, validation_fraction,
                                       n_iter_no_change, tol):

    max_iter = 50

    X, y = data

    gb = HistGradientBoostingClassifier(
        verbose=1,  # just for coverage
        min_samples_leaf=5,  # easier to overfit fast
        scoring=scoring,
        tol=tol,
        validation_fraction=validation_fraction,
        max_iter=max_iter,
        n_iter_no_change=n_iter_no_change,
        random_state=0
    )
    gb.fit(X, y)

    if n_iter_no_change is not None:
        assert n_iter_no_change <= gb.n_iter_ < max_iter
    else:
        assert gb.n_iter_ == max_iter
def test_binning_train_validation_are_separated():
    # Make sure training and validation data are binned separately.
    # See issue 13926
    rng = np.random.RandomState(0)
    validation_fraction = .2
    gb = HistGradientBoostingClassifier(
        n_iter_no_change=5,
        validation_fraction=validation_fraction,
        random_state=rng
    )
    gb.fit(X_classification, y_classification)
    mapper_training_data = gb.bin_mapper_

    # Note that since the data is small there is no subsampling and the
    # random_state doesn't matter
    mapper_whole_data = _BinMapper(random_state=0)
    mapper_whole_data.fit(X_classification)

    n_samples = X_classification.shape[0]
    assert np.all(mapper_training_data.actual_n_bins_ ==
                  int((1 - validation_fraction) * n_samples))
    assert np.all(mapper_training_data.actual_n_bins_ !=
                  mapper_whole_data.actual_n_bins_)
def common_test_model_hgb_classifier(self, add_nan=False, n_classes=2):
    model = HistGradientBoostingClassifier(max_iter=5, max_depth=2)
    X, y = make_classification(n_features=10, n_samples=1000,
                               n_informative=4, n_classes=n_classes,
                               random_state=42)
    if add_nan:
        rows = numpy.random.randint(0, X.shape[0] - 1, X.shape[0] // 3)
        cols = numpy.random.randint(0, X.shape[1] - 1, X.shape[0] // 3)
        X[rows, cols] = numpy.nan

    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.5,
                                                   random_state=42)
    model.fit(X_train, y_train)
    model_onnx = convert_sklearn(
        model, "unused",
        [("input", FloatTensorType([None, X.shape[1]]))])
    self.assertIsNotNone(model_onnx)
    X_test = X_test.astype(numpy.float32)[:5]
    dump_data_and_model(X_test, model, model_onnx, folder=self.folder)
def test_infinite_values_missing_values():
    # High level test making sure that inf and nan values are properly handled
    # when both are present. This is similar to
    # test_split_on_nan_with_infinite_values() in test_grower.py, though we
    # cannot check the predictions for binned values here.
    X = np.asarray([-np.inf, 0, 1, np.inf, np.nan]).reshape(-1, 1)
    y_isnan = np.isnan(X.ravel())
    y_isinf = X.ravel() == np.inf

    stump_clf = HistGradientBoostingClassifier(min_samples_leaf=1, max_iter=1,
                                               learning_rate=1, max_depth=2)

    assert stump_clf.fit(X, y_isinf).score(X, y_isinf) == 1
    assert stump_clf.fit(X, y_isnan).score(X, y_isnan) == 1
def clasificar_HistGradientBoostingClassifier(X, y, df, trainInputs,
                                              trainOutputs, testInputs,
                                              testOutputs, graphname):
    print("\n[" + str(graphname) + "]")
    scoreArray = np.array([])
    clf = HistGradientBoostingClassifier()
    scores = cross_val_score(clf, X, y, cv=10)
    clf = clf.fit(trainInputs, trainOutputs)
    precisionTrain = clf.score(trainInputs, trainOutputs)
    precisionTest = clf.score(testInputs, testOutputs)
    print("\tCCR train = %.2f%% | CCR test = %.2f%%"
          % (precisionTrain * 100, precisionTest * 100))
    prediccion_test = clf.predict(testInputs)
    print(prediccion_test)
    print(testOutputs)
    return precisionTest
data_train, target_train = data_train[:subsample], target_train[:subsample]

n_samples, n_features = data_train.shape
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
# ############################################################
# BaggingClassifier
clf_bc = BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                           random_state=0)
clf_bc.fit(x_train, y_train)
bc_pred = clf_bc.predict(x_test)
bc_matrices = evaluate_preds(clf_bc, x_test, y_test, bc_pred)

# ############################################################
# ExtraTreesClassifier
clf_etc = ExtraTreesClassifier()
clf_etc.fit(x_train, y_train)
etc_pred = clf_etc.predict(x_test)
et_matrices = evaluate_preds(clf_etc, x_test, y_test, etc_pred)

# ############################################################
# HistGradientBoostingClassifier
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)

# ############################################################
# LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)

# ############################################################
# StackingClassifier
clf_sc = StackingClassifier(estimators=estimators,
                            final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
multi_clf.fit(patient_train_data, train_labels.to_numpy()[:, 1:11])
res = 1 / (1 + np.exp(-multi_clf.decision_function(patient_test_data)))
task1_df = pd.DataFrame(data=res, columns=train_labels.columns[1:11])
task1_df.to_csv('subtask1.csv', index=False, header=True)
t2 = time.time()
print('subtask1, time taken: ', t2 - t1)
print(task1_df)

# subtask 2
t1 = time.time()
clf.fit(patient_train_data, train_labels.to_numpy()[:, 11])
res = 1 / (1 + np.exp(-clf.decision_function(patient_test_data)))
task2_df = pd.DataFrame(data=res, columns=[train_labels.columns[11]])
task2_df.to_csv('subtask2.csv', index=False, header=True)
t2 = time.time()
print('subtask2, time taken: ', t2 - t1)
print(task2_df)

# subtask 3
t1 = time.time()
reg = HistGradientBoostingRegressor(random_state=1510)