def test_min_impurity_decrease():
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_decrease=0.1)
        est.fit(X, y)
        for tree in est.estimators_:
            # Simply check if the parameter is passed on correctly. Tree tests
            # will suffice for the actual working of this param
            assert tree.min_impurity_decrease == 0.1
def test_min_impurity_split():
    # Test if min_impurity_split of base estimators is set
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    all_estimators = [RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor]

    for Estimator in all_estimators:
        est = Estimator(min_impurity_split=0.1)
        est = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                   est.fit, X, y)
        for tree in est.estimators_:
            assert tree.min_impurity_split == 0.1
.. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
    Learning Ed. 2", Springer, 2009.
"""
print(__doc__)

# Author: Peter Prettenhofer <*****@*****.**>
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from mrex import ensemble
from mrex import datasets

X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

original_params = {'n_estimators': 1000, 'max_leaf_nodes': 4,
                   'max_depth': None, 'random_state': 2,
                   'min_samples_split': 5}
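# A minimal sketch (not from the original file) of how ``original_params``
# would typically be varied to compare regularization strategies, assuming
# ``mrex.ensemble`` mirrors scikit-learn's ``GradientBoostingClassifier``
# API; the three settings below are illustrative.
for setting in ({'learning_rate': 1.0, 'subsample': 1.0},  # no shrinkage
                {'learning_rate': 0.1, 'subsample': 1.0},  # shrinkage only
                {'learning_rate': 0.1, 'subsample': 0.5}):  # + subsampling
    params = dict(original_params)
    params.update(setting)
    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)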
# load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# also load the boston dataset
# and randomly permute it
boston = datasets.load_boston()
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]

# also make a hastie_10_2 dataset
hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
hastie_X = hastie_X.astype(np.float32)

# Get the default backend in joblib to test parallelism and interaction with
# different backends
DEFAULT_JOBLIB_BACKEND = joblib.parallel.get_active_backend()[0].__class__

FOREST_CLASSIFIERS = {
    "ExtraTreesClassifier": ExtraTreesClassifier,
    "RandomForestClassifier": RandomForestClassifier,
}

FOREST_REGRESSORS = {
    "ExtraTreesRegressor": ExtraTreesRegressor,
    "RandomForestRegressor": RandomForestRegressor,
}
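# A minimal sketch (hypothetical, not from this file) of how the registries
# above can drive parametrized tests with pytest; ``name`` doubles as the
# test id.
import pytest

@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
def test_forest_fits_hastie(name):
    clf = FOREST_CLASSIFIERS[name](n_estimators=10, random_state=0)
    clf.fit(hastie_X, hastie_y)
    assert len(clf.estimators_) == 10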
# License: BSD 3 clause

import time

import numpy as np
import matplotlib.pyplot as plt

from mrex import ensemble
from mrex import datasets
from mrex.model_selection import train_test_split

print(__doc__)

data_list = [datasets.load_iris(), datasets.load_digits()]
data_list = [(d.data, d.target) for d in data_list]
data_list += [datasets.make_hastie_10_2()]
names = ['Iris Data', 'Digits Data', 'Hastie Data']

n_gb = []
score_gb = []
time_gb = []
n_gbes = []
score_gbes = []
time_gbes = []
n_estimators = 500

for X, y in data_list:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=0)
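    # Inside this loop the benchmark presumably builds one model without and
    # one with early stopping. A minimal sketch, assuming mrex's
    # GradientBoostingClassifier supports scikit-learn's early-stopping
    # parameters (validation_fraction, n_iter_no_change, tol); the values
    # here are illustrative.
    gb = ensemble.GradientBoostingClassifier(n_estimators=n_estimators,
                                             random_state=0)
    gbes = ensemble.GradientBoostingClassifier(n_estimators=n_estimators,
                                               validation_fraction=0.2,
                                               n_iter_no_change=5, tol=0.01,
                                               random_state=0)
    gb.fit(X_train, y_train)
    gbes.fit(X_train, y_train)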
from matplotlib import pyplot as plt

from mrex.datasets import make_hastie_10_2
from mrex.model_selection import GridSearchCV
from mrex.metrics import make_scorer
from mrex.metrics import accuracy_score
from mrex.tree import DecisionTreeClassifier

print(__doc__)

###############################################################################
# Running ``GridSearchCV`` using multiple evaluation metrics
# ----------------------------------------------------------
#

X, y = make_hastie_10_2(n_samples=8000, random_state=42)

# The scorers can either be one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# Setting refit='AUC' refits an estimator on the whole dataset with the
# parameter setting that has the best cross-validated AUC score.
# That estimator is made available at ``gs.best_estimator_`` along with
# parameters like ``gs.best_score_``, ``gs.best_params_`` and
# ``gs.best_index_``
gs = GridSearchCV(DecisionTreeClassifier(random_state=42),
                  param_grid={'min_samples_split': range(2, 403, 10)},
                  scoring=scoring, refit='AUC', return_train_score=True)
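# A short usage sketch (an assumption, not part of the original excerpt):
# after fitting, ``cv_results_`` exposes one column per scorer using the
# ``mean_test_<scorer-name>`` naming convention.
gs.fit(X, y)
results = gs.cv_results_
print("best params:", gs.best_params_, "best AUC:", gs.best_score_)
print(results['mean_test_AUC'][:3])
print(results['mean_test_Accuracy'][:3])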
def test_make_hastie_10_2():
    X, y = make_hastie_10_2(n_samples=100, random_state=0)

    assert X.shape == (100, 10), "X shape mismatch"
    assert y.shape == (100,), "y shape mismatch"
    assert np.unique(y).shape == (2,), "Unexpected number of classes"
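# ``make_hastie_10_2`` draws 10 standard-normal features and labels each
# sample y = 1 if sum_j x_j**2 > 9.34, else y = -1 (Hastie et al., Example
# 10.2). A minimal sketch re-deriving the labels from X, assuming mrex
# follows scikit-learn's definition of this generator:
def test_make_hastie_10_2_labels():
    X, y = make_hastie_10_2(n_samples=100, random_state=0)
    expected = np.where((X ** 2).sum(axis=1) > 9.34, 1.0, -1.0)
    assert np.array_equal(y, expected)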