def test_model_stacking_fit_transform():
    """Fitting a ModelStacking pipeline should produce outputs shaped like the targets."""
    # Two base learners whose predictions become features for the judge.
    boosting_learner = SKLearnWrapper(
        GradientBoostingRegressor(),
        HyperparameterSpace({
            "n_estimators": RandInt(50, 600),
            "max_depth": RandInt(1, 10),
            "learning_rate": LogUniform(0.07, 0.7)
        }))
    clustering_learner = SKLearnWrapper(
        KMeans(),
        HyperparameterSpace({"n_clusters": RandInt(5, 10)}))
    # A Ridge regression arbitrates over the transposed stacked predictions.
    ridge_judge = SKLearnWrapper(
        Ridge(),
        HyperparameterSpace({
            "alpha": LogUniform(0.7, 1.4),
            "fit_intercept": Boolean()
        }))
    stacking_pipeline = Pipeline([
        ModelStacking(
            [boosting_learner, clustering_learner],
            joiner=NumpyTranspose(),
            judge=ridge_judge,
        )
    ])

    labels_shape = (379, 1)
    features_shape = (379, 13)
    features = _create_data(features_shape)
    labels = _create_data(labels_shape)

    stacking_pipeline, predictions = stacking_pipeline.fit_transform(features, labels)

    assert predictions.shape == labels_shape
def test_automl_should_shallow_copy_data_before_each_epoch():
    """AutoML must shallow-copy data before each epoch so steps cannot corrupt it.

    Regression test, see issue #332 https://github.com/Neuraxio/Neuraxle/issues/332
    """
    features = np.random.randint(0, 100, (100, 3))
    labels = np.random.randint(0, 3, 100)

    from sklearn.preprocessing import StandardScaler
    pipeline = Pipeline([
        SKLearnWrapper(StandardScaler()),
        SKLearnWrapper(LinearSVC(), HyperparameterSpace({'C': RandInt(0, 10000)})),
    ])

    automl_loop = AutoML(
        pipeline,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=10,
        cache_folder_when_no_handle='cache',
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder='cache'),
        continue_loop_on_error=False)

    fitted_automl = automl_loop.fit(features, labels)

    # The loop must complete and hand back a refit best pipeline.
    assert isinstance(fitted_automl.get_best_model(), Pipeline)
def test_sklearn_wrapper_fit_transform_with_predict():
    """A wrapped predictor without `transform` should fall back to `predict` on transform."""
    wrapper = SKLearnWrapper(LinearRegression())
    inputs = np.expand_dims(np.array(list(range(10))), axis=-1)
    targets = np.expand_dims(np.array(list(range(10, 20))), axis=-1)

    wrapper, predictions = wrapper.fit_transform(inputs, targets)

    # The relationship is exactly linear, so the fit reproduces the targets.
    assert np.array_equal(predictions, targets)
def test_sklearn_wrapper_fit_transform_with_transform():
    """A wrapped transformer (PCA) should reduce inputs to `n_components` columns."""
    n_components = 2
    wrapper = SKLearnWrapper(PCA(n_components=n_components))
    n_rows, n_cols = 10, 10
    inputs, targets = _create_data_source((n_rows, n_cols))

    wrapper, transformed = wrapper.fit_transform(inputs, targets)

    assert transformed.shape == (n_rows, n_components)
def test_sklearn_wrapper_transform_partial_fit_with_predict():
    """Repeated `partial_fit` passes on shuffled data should converge an SGD regressor."""
    regressor = SKLearnWrapper(SGDRegressor(), use_partial_fit=True)
    pipeline = Pipeline([DataShuffler(), regressor])
    inputs = np.expand_dims(np.array(list(range(10))), axis=-1)
    targets = np.expand_dims(np.array(list(range(10, 20))), axis=-1)

    # Many incremental passes stand in for full-batch training.
    for _ in range(2000):
        pipeline = pipeline.fit(inputs, targets)
    predictions = regressor.transform(inputs)

    assert all(np.isclose(target, prediction, atol=0.1)
               for target, prediction in zip(targets, predictions))
def test_deep_learning_pipeline_with_random_search():
    """RandomSearch over a DeepLearningPipeline should refit a low-MSE best model."""
    # Given: a regression dataset and a searchable deep-learning pipeline.
    data_inputs, expected_outputs = create_2d_data()
    searcher = RandomSearch(
        DeepLearningPipeline(
            SKLearnWrapper(linear_model.LinearRegression()),
            batch_size=BATCH_SIZE,
            batch_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
            shuffle_in_each_epoch_at_train=True,
            n_epochs=N_EPOCHS,
            epochs_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
            scoring_function=to_numpy_metric_wrapper(mean_squared_error),
            validation_size=0.15),
        n_iter=N_ITER)

    # When: the search runs and the best model is switched to inference mode.
    searcher, _ = searcher.fit_transform(data_inputs, expected_outputs)
    best_model = searcher.get_best_model()
    best_model.set_train(False)
    best_model.apply('disable_metrics')

    # Then: the refit model reaches a low mean squared error.
    predictions = best_model.transform(data_inputs)
    mse = ((predictions - expected_outputs) ** 2).mean()
    assert mse < 2
def test_sklearn_wrapper_transform_partial_fit_classifier():
    """A partially-fitted SGD classifier should only emit labels from the declared classes."""
    features = np.array([[0, 1], [0, 0], [3, -2], [-1, 1], [-2, 1],
                         [2, 0], [2, -1], [4, -2], [-3, 1], [-1, 0]])
    labels = np.ravel(
        np.expand_dims(features[:, 0] + 2 * features[:, 1] + 1, axis=-1))
    known_classes = np.array([0, 1, 2, 3])
    classifier = SKLearnWrapper(
        SGDClassifier(),
        use_partial_fit=True,
        # `classes` must be forwarded to every partial_fit call.
        partial_fit_kwargs={'classes': known_classes})
    pipeline = Pipeline([DataShuffler(), classifier])

    for _ in range(2000):
        pipeline = pipeline.fit(features, labels)
    predictions = classifier.transform(features)

    assert predictions.shape == (10,)
    # Every predicted label must belong to the declared class set.
    assert set(predictions).issubset(set(known_classes))
def test_deep_learning_pipeline():
    """End-to-end check of DeepLearningPipeline metric collection on a linear regression."""
    # Given
    data_inputs, expected_outputs = create_2d_data()
    p = DeepLearningPipeline(
        SKLearnWrapper(linear_model.LinearRegression()),
        validation_size=VALIDATION_SIZE,
        batch_size=BATCH_SIZE,
        batch_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
        shuffle_in_each_epoch_at_train=True,
        n_epochs=N_EPOCHS,
        epochs_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
        scoring_function=to_numpy_metric_wrapper(mean_squared_error),
    )

    # When
    p, outputs = p.fit_transform(data_inputs, expected_outputs)
    # `apply` recursively collects metrics from every nested step, keyed by step path.
    metrics = p.apply('get_metrics')

    # Then
    # NOTE(review): the two 'train' lookups below appear swapped relative to
    # their variable names — batch_mse_train reads the epoch-level metrics key
    # and epoch_mse_train reads the batch-level key, which contradicts the
    # length assertions further down (epoch series should have N_EPOCHS
    # entries). Confirm against a run before changing; the validation pair
    # uses the opposite (expected) mapping.
    batch_mse_train = metrics[
        'DeepLearningPipeline__EpochRepeater__validation_split_wrapper__epoch_metrics'][
        'train']['mse']
    epoch_mse_train = metrics[
        'DeepLearningPipeline__EpochRepeater__validation_split_wrapper__epoch_metrics__TrainShuffled__MiniBatchSequentialPipeline__batch_metrics'][
        'train']['mse']
    batch_mse_validation = metrics[
        'DeepLearningPipeline__EpochRepeater__validation_split_wrapper__epoch_metrics__TrainShuffled__MiniBatchSequentialPipeline__batch_metrics'][
        'validation']['mse']
    epoch_mse_validation = metrics[
        'DeepLearningPipeline__EpochRepeater__validation_split_wrapper__epoch_metrics'][
        'validation']['mse']

    # One epoch-metric entry per epoch.
    assert len(epoch_mse_train) == N_EPOCHS
    assert len(epoch_mse_validation) == N_EPOCHS
    # One batch-metric entry per training batch per epoch.
    expected_len_batch_mse = math.ceil(
        (len(data_inputs) / BATCH_SIZE) * (1 - VALIDATION_SIZE)) * N_EPOCHS
    assert len(batch_mse_train) == expected_len_batch_mse
    assert len(batch_mse_validation) == expected_len_batch_mse

    last_batch_mse_validation = batch_mse_validation[-1]
    last_batch_mse_train = batch_mse_train[-1]
    last_epoch_mse_train = epoch_mse_train[-1]
    last_epoch_mse_validation = epoch_mse_validation[-1]

    # Training error is expected to end below validation error.
    assert last_batch_mse_train < last_batch_mse_validation
    assert last_epoch_mse_train < last_epoch_mse_validation
def test_automl_sklearn_model_with_base_estimator(tmpdir):
    """An sklearn ensemble built around a base estimator should survive an AutoML loop."""
    base_estimator = GradientBoostingRegressor()
    ensemble = BaggingRegressor(base_estimator, random_state=5, n_jobs=-1)
    wrapped_ensemble = SKLearnWrapper(
        ensemble,
        HyperparameterSpace({
            "n_estimators": RandInt(10, 100),
            "max_features": Uniform(0.6, 1.0)
        }),
    )
    _test_within_auto_ml_loop(tmpdir, wrapped_ensemble)
def test_pipeline_tosklearn():
    """Converting a Neuraxle Pipeline to sklearn must keep hyperparameter paths addressable.

    Checks that `set_hyperparams` (Neuraxle side) and `set_params` with
    double-underscore paths (sklearn side) both reach the same innermost step.
    """
    import sklearn.pipeline

    the_step = SomeStep()
    # Keep a handle on the converted step so we can assert on it after nesting.
    step_to_check = the_step.tosklearn()

    p = Pipeline([
        ("a", SomeStep()),
        ("b", SKLearnWrapper(
            sklearn.pipeline.Pipeline([
                ("a", sklearn.pipeline.Pipeline([
                    ('z', step_to_check)
                ])),
                ("b", SomeStep().tosklearn()),
                ("c", SomeStep().tosklearn())
            ]),
            return_all_sklearn_default_params_on_get=True)),
        ("c", SomeStep())
    ])
    # assert False

    # Neuraxle-side nested hyperparams must reach the wrapped sklearn step 'z'.
    p.set_hyperparams({
        "b": {
            "a__z__learning_rate": 7,
            "b__learning_rate": 9
        }
    })
    assert the_step.get_hyperparams()["learning_rate"] == 7

    # Convert the whole Neuraxle pipeline to sklearn and nest it once more.
    p = p.tosklearn()
    p = sklearn.pipeline.Pipeline([('sk', p)])

    # sklearn-side set_params with a full double-underscore path must also reach 'z'.
    p.set_params(**{"sk__b__a__z__learning_rate": 11})
    sk_ = p.named_steps["sk"]
    b_ = sk_.p["b"]
    predictor = b_.wrapped_sklearn_predictor
    a_ = predictor.named_steps["a"]
    z_ = a_["z"]
    assert z_.get_params()["learning_rate"] == 11

    # A nested dict flattened to sklearn param paths must behave the same way.
    p.set_params(**nested_dict_to_flat({
        "sk__b": {
            "a__z__learning_rate": 12,
            "b__learning_rate": 9
        }
    }))
    # p.set_params(**{"sk__b__a__z__learning_rate": 12})
    assert p.named_steps["sk"].p["b"].wrapped_sklearn_predictor.named_steps["a"]["z"].get_params()[
        "learning_rate"] == 12

    print(the_step.get_hyperparams())
def test_sklearn_wrapper_update_hyperparams():
    """`update_hyperparams` should override only the given keys and keep the rest."""
    wrapper = SKLearnWrapper(PCA())
    wrapper.set_hyperparams(HyperparameterSamples({
        'n_components': 2,
        'svd_solver': 'full'
    }))

    wrapper.update_hyperparams(HyperparameterSamples({'n_components': 4}))

    predictor = wrapper.wrapped_sklearn_predictor
    assert predictor.n_components == 4
    # The untouched hyperparameter must survive the update.
    assert predictor.svd_solver == 'full'
from sklearn.utils import shuffle

from neuraxle.pipeline import Pipeline
from neuraxle.steps.numpy import NumpyShapePrinter
from neuraxle.steps.sklearn import SKLearnWrapper, RidgeModelStacking
from neuraxle.union import AddFeatures

# Load and shuffle the Boston housing dataset, then hold out 25% for testing.
boston = load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

p = Pipeline([
    NumpyShapePrinter(),
    # Augment the raw features with 2-component PCA and ICA projections.
    AddFeatures([
        SKLearnWrapper(PCA(n_components=2)),
        SKLearnWrapper(FastICA(n_components=2)),
    ]),
    NumpyShapePrinter(),
    # Stack several boosted regressors (and a clusterer) behind a Ridge judge.
    RidgeModelStacking([
        SKLearnWrapper(GradientBoostingRegressor()),
        SKLearnWrapper(GradientBoostingRegressor(n_estimators=500)),
        SKLearnWrapper(GradientBoostingRegressor(max_depth=5)),
        SKLearnWrapper(KMeans()),
    ]),
    NumpyShapePrinter(),
])

print("Fitting on train:")
p = p.fit(X_train, y_train)
print("")
def test_deep_learning_pipeline():
    """DeepLearningPipeline over a stacked Boston-housing model should expose per-batch and per-epoch MSE."""
    # Given
    boston = load_boston()
    data_inputs, expected_outputs = shuffle(boston.data, boston.target, random_state=13)
    expected_outputs = expected_outputs.astype(np.float32)
    data_inputs = data_inputs.astype(np.float32)

    pipeline = Pipeline([
        # Augment raw features with PCA/ICA projections whose sizes are tunable.
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        # Stack a boosted regressor and a clusterer behind a Ridge judge.
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(n_clusters=7),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])

    p = DeepLearningPipeline(
        pipeline,
        validation_size=VALIDATION_SIZE,
        batch_size=BATCH_SIZE,
        batch_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
        shuffle_in_each_epoch_at_train=True,
        n_epochs=N_EPOCHS,
        epochs_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
        scoring_function=to_numpy_metric_wrapper(mean_squared_error),
    )

    # When
    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    # Then
    batch_mse_train = p.get_batch_metric_train('mse')
    epoch_mse_train = p.get_epoch_metric_train('mse')
    batch_mse_validation = p.get_batch_metric_validation('mse')
    epoch_mse_validation = p.get_epoch_metric_validation('mse')

    # One epoch-metric entry per epoch.
    assert len(epoch_mse_train) == N_EPOCHS
    assert len(epoch_mse_validation) == N_EPOCHS
    # One batch-metric entry per batch per epoch, split train/validation.
    expected_len_batch_mse_train = math.ceil(
        (len(data_inputs) / BATCH_SIZE) * (1 - VALIDATION_SIZE)) * N_EPOCHS
    expected_len_batch_mse_validation = math.ceil(
        (len(data_inputs) / BATCH_SIZE) * VALIDATION_SIZE) * N_EPOCHS
    assert len(batch_mse_train) == expected_len_batch_mse_train
    assert len(batch_mse_validation) == expected_len_batch_mse_validation

    last_batch_mse_validation = batch_mse_validation[-1]
    last_batch_mse_train = batch_mse_train[-1]
    last_epoch_mse_train = epoch_mse_train[-1]
    last_epoch_mse_validation = epoch_mse_validation[-1]

    # Training error should end below validation error, and both should converge.
    assert last_batch_mse_train < last_batch_mse_validation
    assert last_epoch_mse_train < last_epoch_mse_validation
    assert last_batch_mse_train < 1
    assert last_epoch_mse_train < 1
def main(tmpdir):
    """Run an AutoML random search over a stacked regression pipeline on Boston housing data.

    Fits on a 75% train split, refits the best trial, then prints train/test R² scores.

    :param tmpdir: folder used to cache AutoML trial results.
    """
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but it could be already set
    # within the classes at their definition if using custom classes, or also it could be defined after declaring the
    # pipeline using a flat dict or a nested dict.
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
        ]),
        ModelStacking([
            SKLearnWrapper(
                GradientBoostingRegressor(),
                HyperparameterSpace({
                    "n_estimators": RandInt(50, 300),
                    "max_depth": RandInt(1, 4),
                    "learning_rate": LogUniform(0.07, 0.7)
                })
            ),
            SKLearnWrapper(
                KMeans(),
                HyperparameterSpace({"n_clusters": RandInt(5, 10)})
            ),
        ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({"alpha": LogUniform(0.7, 1.4), "fit_intercept": Boolean()})
            ),
        )
    ])

    print("Meta-fitting on train:")
    auto_ml = AutoML(
        p,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=1,  # 1 epoch here due to using sklearn models that just fit once.
        cache_folder_when_no_handle=str(tmpdir),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    )

    random_search = auto_ml.fit(X_train, y_train)
    p = random_search.get_best_model()
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)
    print("")

    print("Evaluating transformed train:")
    # Fixed: sklearn's signature is r2_score(y_true, y_pred); the arguments were
    # previously passed in the reverse order, which reports a wrong score since
    # R² is not symmetric in its arguments.
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)
    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
def test_sklearn_wrapper_with_an_invalid_step():
    """SKLearnWrapper must reject steps that are not sklearn estimators."""
    non_sklearn_step = Identity()
    with pytest.raises(ValueError):
        SKLearnWrapper(non_sklearn_step)
def main():
    """AutoML example: let the loop pick the best of five classifiers, with tuned hyperparams."""
    # Define classification models, and hyperparams.
    # See also HyperparameterSpace documentation : https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace

    decision_tree_classifier = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    extra_tree_classifier = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    # NumpyRavel flattens the expected outputs before they reach the estimator;
    # presumably because these sklearn classifiers expect 1-D target arrays — confirm.
    ridge_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RidgeClassifier(),
            HyperparameterSpace({
                'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
                'fit_intercept': Boolean(),
                'normalize': Boolean()
            }))
    ]).set_name('RidgeClassifier')

    logistic_regression = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            LogisticRegression(),
            HyperparameterSpace({
                'C': LogUniform(0.01, 10.0),
                'fit_intercept': Boolean(),
                'penalty': Choice(['none', 'l2']),
                'max_iter': RandInt(20, 200)
            }))
    ]).set_name('LogisticRegression')

    random_forest_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RandomForestClassifier(),
            HyperparameterSpace({
                'n_estimators': RandInt(50, 600),
                'criterion': Choice(['gini', 'entropy']),
                'min_samples_leaf': RandInt(2, 5),
                'min_samples_split': RandInt(2, 4),
                'bootstrap': Boolean()
            }))
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifier.
    # See also ChooseOneStepOf documentation : https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf
    pipeline = Pipeline([
        ChooseOneStepOf([
            decision_tree_classifier,
            extra_tree_classifier,
            ridge_classifier,
            logistic_regression,
            random_forest_classifier
        ])
    ])

    # Create the AutoML loop object.
    # See also AutoML documentation : https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML
    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
        refit_trial=True,
        continue_loop_on_error=False)

    # Load data, and launch AutoML loop !
    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict.
    # See also predict documentation : https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict
    best_pipeline = auto_ml.get_best_model()
    y_pred = best_pipeline.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    # Clean up the trial cache created above.
    shutil.rmtree('cache')
def test_automl_sklearn(tmpdir):
    """A plain wrapped sklearn regressor should survive a full AutoML loop."""
    wrapped_regressor = SKLearnWrapper(GradientBoostingRegressor())
    _test_within_auto_ml_loop(tmpdir, wrapped_regressor)
def test_sklearn_wrapper_set_hyperparams():
    """Hyperparams set on the wrapper must be forwarded to the wrapped sklearn predictor."""
    wrapper = SKLearnWrapper(PCA())
    wrapper.set_hyperparams(HyperparameterSamples({'n_components': 2}))
    assert wrapper.wrapped_sklearn_predictor.n_components == 2
from neuraxle.union import AddFeatures, ModelStacking boston = load_boston() X, y = shuffle(boston.data, boston.target, random_state=13) X = X.astype(np.float32) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False) # Note that the hyperparameter spaces are defined here during the pipeline definition, but it could be already set # within the classes ar their definition if using custom classes, or also it could be defined after declaring the # pipeline using a flat dict or a nested dict. p = Pipeline([ AddFeatures([ SKLearnWrapper(PCA(n_components=2), HyperparameterSpace({"n_components": RandInt(1, 3)})), SKLearnWrapper(FastICA(n_components=2), HyperparameterSpace({"n_components": RandInt(1, 3)})), ]), ModelStacking( [ SKLearnWrapper( GradientBoostingRegressor(), HyperparameterSpace({ "n_estimators": RandInt(50, 600), "max_depth": RandInt(1, 10), "learning_rate": LogUniform(0.07, 0.7) })), SKLearnWrapper( GradientBoostingRegressor(), HyperparameterSpace({
def main():
    """Meta-fit a stacked regression pipeline on Boston housing data with RandomSearch.

    Uses 10-fold cross-validation to score trials, then prints train/test R² scores.
    """
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but it could be already set
    # within the classes at their definition if using custom classes, or also it could be defined after declaring the
    # pipeline using a flat dict or a nested dict.
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])

    print("Meta-fitting on train:")
    p = p.meta_fit(X_train, y_train,
                   metastep=RandomSearch(
                       n_iter=10,
                       higher_score_is_better=True,
                       validation_technique=KFoldCrossValidationWrapper(
                           scoring_function=r2_score, k_fold=10)))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)
    print("")

    print("Evaluating transformed train:")
    # Fixed: sklearn's signature is r2_score(y_true, y_pred); the arguments were
    # previously passed in the reverse order, which reports a wrong score since
    # R² is not symmetric in its arguments.
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)
    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)