def test_warm_start_smaller_n_estimators():
    # Test that a warm-started second fit with smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
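# For reference, a minimal sketch (not part of the test suite above) of the
# warm-start pattern these tests exercise: grow an already fitted
# BaggingClassifier by raising n_estimators via set_params and refitting.
# Dataset and sizes are illustrative only.
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import BaggingClassifier

X, y = make_hastie_10_2(n_samples=100, random_state=0)

clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=0)
clf.fit(X, y)                # fits 5 base estimators
clf.set_params(n_estimators=10)
clf.fit(X, y)                # warm start: only the 5 new estimators are fit
print(len(clf.estimators_))  # 10

# Shrinking the ensemble is the error case tested above:
clf.set_params(n_estimators=4)  # the next fit(X, y) raises ValueError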
class RFsLDA(BaseEstimator, ClassifierMixin):
    def __init__(self, n_sLDA_estimators=200, n_estimators_per_RF=20):
        self.n_sLDA_estimators_ = n_sLDA_estimators
        self.n_estimators_per_RF_ = n_estimators_per_RF
        self.estimator_ = BaggingClassifier(
            base_estimator=IntWeightedShrinkageLDA(),
            n_estimators=n_sLDA_estimators,
            warm_start=True)

    def fit(self, X, y):
        Xc = self.normalise_(X)
        # First pass: fit the bagged sLDA ensemble.
        self.estimator_.fit(Xc, y)
        # Subsequent warm-started fits grow the ensemble with random forests.
        self.estimator_.set_params(
            base_estimator=RandomForestClassifier(
                n_estimators=self.n_estimators_per_RF_,
            ),
            bootstrap=False,
        )
        for k in range(self.n_sLDA_estimators_):
            self.estimator_.set_params(
                n_estimators=self.n_sLDA_estimators_ + 1 + k)
            # Train the new forest only on the samples that the k-th sLDA
            # base estimator misclassified.
            wrong = self.estimator_.estimators_[k].predict(Xc) != y
            self.estimator_.fit(Xc[wrong, :], y[wrong])
        return self

    def predict(self, X):
        return self.estimator_.predict(self.normalise_(X))

    def normalise_(self, X):
        # Per-sample robust standardisation: centre by the row median and
        # scale by the row median absolute deviation.
        centred = X - np.median(X, axis=1, keepdims=True)
        return centred / np.median(np.abs(centred), axis=1, keepdims=True)
class Bagging(object):
    def __init__(self, dataset_x, dataset_y):
        self.dataset_x = dataset_x
        self.dataset_y = dataset_y
        self.clf = BaggingClassifier()
        self.best_parameter = {}

    def startBagging(self):
        print("------------------ Bagging Classifier -------------------")
        # self.findBestParameters()
        # self.gridSearch()
        self.randomSearch()

    def findBestParameters(self):
        """
        Try different parameters to find the best score.

        :return:
        """
        self.clf = BaggingClassifier()
        scores = cross_val_score(self.clf, self.dataset_x, self.dataset_y,
                                 cv=10, scoring="accuracy")
        print(scores)
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(),
                                               scores.std() * 2))

    def test(self):
        """
        Test the model with the best parameters found in randomSearch()
        or gridSearch().

        :return:
        """
        # self.clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
        #                              n_estimators=30, bootstrap=False,
        #                              max_samples=0.9)
        self.clf = BaggingClassifier()
        self.clf.set_params(**self.best_parameter)
        print("*** Test Result for Bagging ***")
        ModelEvaluation.evaluateModelWithCV(self.clf, self.dataset_x,
                                            self.dataset_y, cv=10)

    def randomSearch(self):
        tuned_parameters = {
            'base_estimator': [DecisionTreeClassifier(), SVC(),
                               LogisticRegression(), KNeighborsClassifier(),
                               MultinomialNB()],
            'n_estimators': [5, 10, 15, 20, 30, 50],
            'max_samples': [0.5, 0.7, 0.9],
            'bootstrap': [True, False]
        }
        self.best_parameter = SearchParameters.randomSearch(
            classifier=self.clf, parameters=tuned_parameters, cv=10,
            n_iter=30, train_x=self.dataset_x, train_y=self.dataset_y)

    def gridSearch(self):
        tuned_parameters = {
            'base_estimator': [DecisionTreeClassifier(), SVC(),
                               LogisticRegression(), KNeighborsClassifier(),
                               MultinomialNB()],
            'n_estimators': [5, 10, 15, 20],
            'max_samples': [0.3, 0.5, 0.7, 0.9],
            'max_features': [0.5, 1.0],
            'bootstrap': [True, False]
        }
        self.best_parameter = SearchParameters.gridSearch(
            classifier=self.clf, parameters=tuned_parameters, cv=10,
            train_x=self.dataset_x, train_y=self.dataset_y)
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_parallel_classification():
    # Check parallel classification.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    # predict_proba
    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict_proba(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = BaggingClassifier(SVC(decision_function_shape="ovr"),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    decisions1 = ensemble.decision_function(X_test)
    ensemble.set_params(n_jobs=2)
    decisions2 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    ensemble = BaggingClassifier(SVC(decision_function_shape="ovr"),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True,
                               random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False,
                            random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(n_estimators=n_estimators,
                                       random_state=random_state,
                                       warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert_equal(len(clf_ws), n_estimators)

    clf_no_ws = BaggingClassifier(n_estimators=10,
                                  random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    assert_equal(set([tree.random_state for tree in clf_ws]),
                 set([tree.random_state for tree in clf_no_ws]))
def test_parallel_classification():
    # Check parallel classification.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    # predict_proba
    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict_proba(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = BaggingClassifier(SVC(gamma='scale',
                                     decision_function_shape='ovr'),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    decisions1 = ensemble.decision_function(X_test)
    ensemble.set_params(n_jobs=2)
    decisions2 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
    assert_raise_message(ValueError,
                         "Number of features of the model "
                         "must match the input. Model n_features is {0} "
                         "and input n_features is {1} "
                         "".format(X_test.shape[1], X_err.shape[1]),
                         ensemble.decision_function, X_err)

    ensemble = BaggingClassifier(SVC(gamma='scale',
                                     decision_function_shape='ovr'),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
def test_parallel_classification():
    # Check parallel classification.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    # predict_proba
    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict_proba(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    ensemble.set_params(n_jobs=1)
    decisions1 = ensemble.decision_function(X_test)
    ensemble.set_params(n_jobs=2)
    decisions2 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
    err_msg = (f"Number of features of the model must match the input. Model "
               f"n_features is {X_test.shape[1]} and input n_features is "
               f"{X_err.shape[1]} ")
    with pytest.raises(ValueError, match=err_msg):
        ensemble.decision_function(X_err)

    ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(n_estimators=n_estimators,
                                       random_state=random_state,
                                       warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BaggingClassifier(n_estimators=10,
                                  random_state=random_state,
                                  warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([tree.random_state for tree in clf_ws]) ==
            set([tree.random_state for tree in clf_no_ws]))
def test_parallel_classification():
    # Check parallel classification.
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=0)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    # predict_proba
    y1 = ensemble.predict_proba(X_test)
    ensemble.set_params(n_jobs=1)
    y2 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = BaggingClassifier(SVC(decision_function_shape="ovr"),
                                 n_jobs=3,
                                 random_state=0).fit(X_train, y_train)

    decisions1 = ensemble.decision_function(X_test)
    ensemble.set_params(n_jobs=1)
    decisions2 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    ensemble = BaggingClassifier(SVC(decision_function_shape="ovr"),
                                 n_jobs=1,
                                 random_state=0).fit(X_train, y_train)

    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)
class BaggedDecisionTreeClassifier:
    def __init__(self, n_estimators=20, bootstrap=True,
                 bootstrap_features=False, oob_score=False, max_depth=None,
                 min_samples_leaf=20, warm_start=False, n_jobs=None,
                 early_stopping='auto',  # accepted but not used by this wrapper
                 verbose=0, random_state=None):
        self.tree = DecisionTreeClassifier(max_depth=max_depth,
                                           min_samples_leaf=min_samples_leaf)
        self.BagDT = BaggingClassifier(base_estimator=self.tree,
                                       n_estimators=n_estimators,
                                       bootstrap=bootstrap,
                                       bootstrap_features=bootstrap_features,
                                       oob_score=oob_score,
                                       warm_start=warm_start,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       verbose=verbose)

    def decision_function(self, X):
        return self.BagDT.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        self.BagDT.fit(X, y, sample_weight=sample_weight)
        return self.BagDT

    def get_params(self, deep=True):
        return self.BagDT.get_params(deep=deep)

    def predict(self, X):
        return self.BagDT.predict(X)

    def predict_log_proba(self, X):
        return self.BagDT.predict_log_proba(X)

    def predict_proba(self, X):
        return self.BagDT.predict_proba(X)

    def score(self, X, y, sample_weight=None):
        return self.BagDT.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        return self.BagDT.set_params(**params)
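# A minimal usage sketch for the wrapper above (not from the original
# source). The data and hyperparameter values are illustrative, and it
# assumes a scikit-learn version in which BaggingClassifier still accepts
# base_estimator (deprecated in 1.2 in favour of estimator).
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)
bag = BaggedDecisionTreeClassifier(n_estimators=10, max_depth=3,
                                   random_state=0)
bag.fit(X, y)
print(bag.score(X, y))  # training accuracy of the bagged trees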
def test_parallel():
    """Check parallel computations."""
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        # predict_proba
        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict_proba(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingClassifier(DecisionTreeClassifier(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict_proba(X_test)
        assert_array_almost_equal(y1, y3)

        # decision_function
        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=n_jobs,
                                     random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        decisions1 = ensemble.decision_function(X_test)
        ensemble.set_params(n_jobs=2)
        decisions2 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions2)

        ensemble = BaggingClassifier(SVC(),
                                     n_jobs=1,
                                     random_state=0).fit(X_train, y_train)

        decisions3 = ensemble.decision_function(X_test)
        assert_array_almost_equal(decisions1, decisions3)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        random_state=rng)

    for n_jobs in [-1, 3]:
        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=n_jobs,
                                    random_state=0).fit(X_train, y_train)

        ensemble.set_params(n_jobs=1)
        y1 = ensemble.predict(X_test)
        ensemble.set_params(n_jobs=2)
        y2 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y2)

        ensemble = BaggingRegressor(DecisionTreeRegressor(),
                                    n_jobs=1,
                                    random_state=0).fit(X_train, y_train)

        y3 = ensemble.predict(X_test)
        assert_array_almost_equal(y1, y3)
class Bagging(Classifier):
    r"""Implementation of bagging classifier.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        L. Breiman, “Bagging predictors”, Machine Learning, 24(2), 123-140, 1996.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'Bagging'

    def __init__(self, **kwargs):
        r"""Initialize Bagging instance.
        """
        warnings.filterwarnings(action='ignore', category=ChangedBehaviorWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        warnings.filterwarnings(action='ignore', category=DataConversionWarning)
        warnings.filterwarnings(action='ignore', category=DataDimensionalityWarning)
        warnings.filterwarnings(action='ignore', category=EfficiencyWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=NonBLASDotWarning)
        warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

        self._params = dict(
            n_estimators=ParameterDefinition(MinMax(min=10, max=111), np.uint),
            bootstrap=ParameterDefinition([True, False]),
            bootstrap_features=ParameterDefinition([True, False]))
        self.__bagging_classifier = BaggingClassifier()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.
        """
        self.__bagging_classifier.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Fit Bagging.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.

        Returns:
            None
        """
        self.__bagging_classifier.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Predict class for each sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        return self.__bagging_classifier.predict(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return Classifier.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(
                self.__bagging_classifier.get_params()))
class HistRandomForestClassifier:
    def __init__(self, loss='auto', max_leaf_nodes=31, max_depth=None,
                 min_samples_leaf=20, l2_regularization=0, max_bins=255,
                 n_estimators=20, max_samples=1.0, bootstrap=True,
                 bootstrap_features=False, oob_score=False,
                 categorical_features=None, monotonic_cst=None,
                 warm_start=False, n_jobs=None, early_stopping='auto',
                 scoring='loss', validation_fraction=0.1,
                 n_iter_no_change=10, tol=1e-7, verbose=0,
                 random_state=None):
        self.loss = loss
        self.max_leaf_nodes = max_leaf_nodes
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.l2_regularization = l2_regularization
        self.max_bins = max_bins
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.categorical_features = categorical_features
        self.monotonic_cst = monotonic_cst
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.early_stopping = early_stopping
        self.scoring = scoring
        self.validation_fraction = validation_fraction
        self.n_iter_no_change = n_iter_no_change
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state
        # A single-iteration boosting model acts as one histogram-based tree.
        self.tree = HistGradientBoostingClassifier(
            loss=loss, learning_rate=1, max_iter=1,
            max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            l2_regularization=l2_regularization, max_bins=max_bins,
            categorical_features=categorical_features,
            monotonic_cst=monotonic_cst, early_stopping=early_stopping,
            scoring=scoring, validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change, tol=tol, verbose=verbose,
            random_state=random_state)
        self.HistRF = BaggingClassifier(base_estimator=self.tree,
                                        n_estimators=n_estimators,
                                        max_samples=max_samples,
                                        bootstrap=bootstrap,
                                        bootstrap_features=bootstrap_features,
                                        oob_score=oob_score,
                                        warm_start=warm_start,
                                        n_jobs=n_jobs,
                                        random_state=random_state,
                                        verbose=verbose)

    def decision_function(self, X):
        return self.HistRF.decision_function(X)

    def fit(self, X, y, sample_weight=None):
        self.HistRF.fit(X, y, sample_weight=sample_weight)
        return self.HistRF

    def get_params(self, deep=True):
        return self.HistRF.get_params(deep=deep)

    def predict(self, X):
        return self.HistRF.predict(X)

    def predict_log_proba(self, X):
        return self.HistRF.predict_log_proba(X)

    def predict_proba(self, X):
        return self.HistRF.predict_proba(X)

    def score(self, X, y, sample_weight=None):
        return self.HistRF.score(X, y, sample_weight=sample_weight)

    def set_params(self, **params):
        return self.HistRF.set_params(**params)
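# As with BaggedDecisionTreeClassifier, a minimal usage sketch (not from
# the original source); values are illustrative, and the same
# base_estimator version caveat applies (loss='auto' also predates
# scikit-learn 1.1).
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, random_state=0)
hrf = HistRandomForestClassifier(n_estimators=10, max_depth=4,
                                 random_state=0)
hrf.fit(X, y)
print(hrf.predict_proba(X[:3]))  # class probabilities for three samples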