import numpy as np
import pytest
from sklearn.ensemble import BaggingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.tree import DecisionTreeClassifier


def replace(X):
    # Helper used by this test: map None/NaN/inf entries to a finite
    # value so the downstream tree can fit.
    X = X.astype("float", copy=True)
    X[~np.isfinite(X)] = 0
    return X


def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array([
        [1, 3, 5],
        [2, None, 6],
        [2, np.nan, 6],
        [2, np.inf, 6],
        [2, -np.inf, 6],  # -np.inf: np.NINF was removed in NumPy 2.0
    ])
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    # validate=False lets the raw object array through to `replace`
    pipeline = make_pipeline(FunctionTransformer(replace, validate=False),
                             classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    with pytest.raises(ValueError):
        bagging_classifier.fit(X, y)
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_random_state

iris = load_iris()  # dataset fixture used by the test


def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(
            base_estimator=DecisionTreeClassifier(),  # renamed `estimator` in scikit-learn >= 1.2
            random_state=rng).fit(X_train, y_train)

        # Probabilities sum to one per sample ...
        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        # ... and predict_log_proba is consistent with predict_proba.
        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing from a bootstrap
        ensemble = BaggingClassifier(
            base_estimator=LogisticRegression(),
            random_state=rng,
            max_samples=5).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)))
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


class BaggedDecisionTreeClassifier:
    """Thin wrapper that bags depth-limited decision trees."""

    def __init__(self, n_estimators=20, bootstrap=True,
                 bootstrap_features=False, oob_score=False,
                 max_depth=None, min_samples_leaf=20, warm_start=False,
                 n_jobs=None, verbose=0, random_state=None):
        self.tree = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf)
        self.BagDT = BaggingClassifier(
            base_estimator=self.tree,
            n_estimators=n_estimators,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)

    def decision_function(self, X):
        # Only available when the base estimator implements it;
        # DecisionTreeClassifier does not, so prefer predict_proba.
        return self.BagDT.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        self.BagDT.fit(X, y, sample_weight=sample_weight)
        return self  # return the wrapper, per scikit-learn convention

    def get_params(self, deep=True):
        return self.BagDT.get_params(deep=deep)

    def predict(self, X):
        return self.BagDT.predict(X)

    def predict_log_proba(self, X):
        return self.BagDT.predict_log_proba(X)

    def predict_proba(self, X):
        return self.BagDT.predict_proba(X)

    def score(self, X, y, sample_weight=None):
        return self.BagDT.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        return self.BagDT.set_params(**params)
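A minimal usage sketch for the wrapper above (dataset choice, names, and
hyperparameter values are illustrative, not prescribed by the original):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = BaggedDecisionTreeClassifier(n_estimators=50, max_depth=4,
                                     random_state=0)
model.fit(X_train, y_train)
print(model.score(X_test, y_test))      # mean accuracy on held-out data
print(model.predict_proba(X_test[:3]))  # averaged class probabilities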
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)  # iris, judging by the 4-feature sample rows below
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=214)

# Build the model objects
dt = DecisionTreeClassifier(criterion='gini', max_depth=5)
algo = BaggingClassifier(base_estimator=dt, n_estimators=10, oob_score=True)

# Train the model
algo.fit(X_train, y_train)

# 7. Evaluate model performance
print('Accuracy on the training set: {}'.format(algo.score(X_train, y_train)))
print('Accuracy on the test set: {}'.format(algo.score(X_test, y_test)))

# 8. Inspect the attribute API (new samples, to avoid shadowing X_test)
X_new = [[6.9, 3.1, 5.1, 2.3], [6.1, 2.8, 4.0, 1.3], [5.2, 3.4, 1.4, 0.2]]
print('Predicted labels:')
print(algo.predict(X_new))
print('Predicted class probabilities:')
print(algo.predict_proba(X_new))
print('Log of the predicted probabilities:')
print(algo.predict_log_proba(X_new))
print('All trained sub-models:\n{}'.format(algo.estimators_))
for k, estimator in enumerate(algo.estimators_):
    print('Predictions of sub-model {}: {}'.format(k + 1,
                                                   estimator.predict(X_new)))
print('Feature indices used to train each sub-model: {}'.format(
    algo.estimators_features_))
print('Out-of-bag accuracy of the bagging model: {}'.format(algo.oob_score_))

# Visualize all sub-models (see the sketch below)
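The trailing comment announces a visualization of the sub-models, but the
code is missing. A minimal sketch, assuming matplotlib is available and that
`algo` was configured with 10 tree sub-estimators as above:

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# One panel per bagged tree (the 2 x 5 grid matches n_estimators=10).
fig, axes = plt.subplots(2, 5, figsize=(25, 10))
for ax, estimator in zip(axes.ravel(), algo.estimators_):
    plot_tree(estimator, ax=ax, filled=True)
plt.show()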
import pandas as pd

# SKLModel is assumed to alias scikit-learn's BaggingClassifier; the
# FunctionTransformer used below must be a lale operator, since fit
# composes it with `>>` (lale's pipeline-composition operator).
from sklearn.ensemble import BaggingClassifier as SKLModel
from lale.lib.sklearn import FunctionTransformer


class _BaggingClassifierImpl:
    def __init__(
        self,
        base_estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
    ):
        estimator_impl = base_estimator
        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "max_samples": max_samples,
            "max_features": max_features,
            "bootstrap": bootstrap,
            "bootstrap_features": bootstrap_features,
            "oob_score": oob_score,
            "warm_start": warm_start,
            "n_jobs": n_jobs,
            "random_state": random_state,
            "verbose": verbose,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        # keep the caller-facing estimator (not the impl) for get_params
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y, sample_weight=None):
        if isinstance(X, pd.DataFrame):
            # Re-attach column names so the base estimator sees a DataFrame
            # even after bagging converts the input to a plain array.
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=X.columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._hyperparams["base_estimator"] = (
                feature_transformer >> self._hyperparams["base_estimator"]
            )
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.fit(X, y, sample_weight)
        return self

    def predict(self, X, **predict_params):
        return self._wrapped_model.predict(X, **predict_params)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        return self._wrapped_model.score(X, y, sample_weight)
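A short usage sketch for the wrapper (it reuses the internal class name above
and assumes the module imports cleanly, i.e. lale is installed; plain numpy
input skips the DataFrame branch in fit, so no `>>` composition is triggered):

from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
clf = _BaggingClassifierImpl(base_estimator=DecisionTreeClassifier(),
                             n_estimators=10, random_state=0)
clf.fit(X, y)
print(clf.score(X, y))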
from sklearn.ensemble import BaggingClassifier, HistGradientBoostingClassifier


class HistRandomForestClassifier:
    """Random-forest-style ensemble of histogram-based trees: each base
    estimator is a HistGradientBoostingClassifier pinned to one boosting
    iteration at full step size (max_iter=1, learning_rate=1), i.e. a
    single histogram tree, which BaggingClassifier then aggregates."""

    def __init__(self, loss='auto', max_leaf_nodes=31, max_depth=None,
                 min_samples_leaf=20, l2_regularization=0, max_bins=255,
                 n_estimators=20, max_samples=1.0, bootstrap=True,
                 bootstrap_features=False, oob_score=False,
                 categorical_features=None, monotonic_cst=None,
                 warm_start=False, n_jobs=None, early_stopping='auto',
                 scoring='loss', validation_fraction=0.1,
                 n_iter_no_change=10, tol=1e-7, verbose=0,
                 random_state=None):
        self.loss = loss
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.categorical_features = categorical_features
        self.monotonic_cst = monotonic_cst
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.early_stopping = early_stopping
        self.scoring = scoring
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

        # A single histogram tree: one boosting iteration at full step size.
        self.tree = HistGradientBoostingClassifier(
            loss=loss, learning_rate=1, max_iter=1,
            max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            l2_regularization=l2_regularization, max_bins=max_bins,
            categorical_features=categorical_features,
            monotonic_cst=monotonic_cst, early_stopping=early_stopping,
            scoring=scoring, validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
            random_state=random_state)
        self.HistRF = BaggingClassifier(
            base_estimator=self.tree,
            n_estimators=n_estimators,
            max_samples=max_samples,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose)

    def decision_function(self, X):
        return self.HistRF.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        self.HistRF.fit(X, y, sample_weight=sample_weight)
        return self  # return the wrapper, per scikit-learn convention

    def get_params(self, deep=True):
        return self.HistRF.get_params(deep=deep)

    def predict(self, X):
        return self.HistRF.predict(X)

    def predict_log_proba(self, X):
        return self.HistRF.predict_log_proba(X)

    def predict_proba(self, X):
        return self.HistRF.predict_proba(X)

    def score(self, X, y, sample_weight=None):
        return self.HistRF.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        return self.HistRF.set_params(**params)
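A usage sketch for the histogram-tree forest (illustrative dataset and
hyperparameters; assumes the class definition above):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

hrf = HistRandomForestClassifier(n_estimators=50, min_samples_leaf=5,
                                 random_state=0)
hrf.fit(X_train, y_train)
print(hrf.score(X_test, y_test))     # mean accuracy on held-out data
print(hrf.predict_proba(X_test[:3]))  # averaged class probabilities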