def test_automl_early_stopping_callback(tmpdir):
    # TODO: fix this unit test
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 60
    auto_ml = AutoML(
        pipeline=Pipeline([
            FitTransformCallbackStep().set_name('callback'),
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False),
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml = auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
    # Minimal check until the test is fixed: a best model should at least be returned.
    assert p is not None


def test_trainer_train():
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    p = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression()
    ])

    trainer: Trainer = Trainer(
        epochs=10,
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        validation_splitter=ValidationSplitter(test_size=0.20))

    repo_trial: Trial = trainer.train(
        pipeline=p, data_inputs=data_inputs, expected_outputs=expected_outputs)

    trained_pipeline = repo_trial.get_trained_pipeline(split_number=0)

    outputs = trained_pipeline.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1


def _test_trial_scores(
        expected_output_mult,
        pipeline,
        hyperparams_optimizer: BaseHyperparameterSelectionStrategy,
        tmpdir: str):
    hp_repository: InMemoryHyperparamsRepository = InMemoryHyperparamsRepository(
        cache_folder=str(tmpdir))
    n_epochs = 1
    n_trials = 20
    auto_ml: AutoML = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=hyperparams_optimizer,
        validation_splitter=ValidationSplitter(0.5),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False),
        ],
        n_trials=n_trials,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository)

    # When
    data_inputs = np.array([0, 0])
    expected_outputs = expected_output_mult * np.ones_like(data_inputs)
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trials: Trials = hp_repository.load_all_trials(status=TRIAL_STATUS.SUCCESS)
    validation_scores = [t.get_validation_score() for t in trials]
    return validation_scores


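# A minimal usage sketch for the `_test_trial_scores` helper above. The concrete
# pipeline, multiplier, and assertion are illustrative assumptions for this sketch,
# not taken from the original test suite.
def test_trial_scores_usage_sketch(tmpdir):
    # A tiny searchable pipeline: one step whose hyperparameter space gets sampled.
    pipeline = Pipeline([
        MultiplyByN(1).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': RandInt(1, 3)}))
    ])

    validation_scores = _test_trial_scores(
        expected_output_mult=2,
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        tmpdir=str(tmpdir))

    # Every successful trial should have recorded a validation score.
    assert len(validation_scores) > 0

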
def test_automl_should_shallow_copy_data_before_each_epoch():
    # see issue #332 https://github.com/Neuraxio/Neuraxle/issues/332
    data_inputs = np.random.randint(0, 100, (100, 3))
    expected_outputs = np.random.randint(0, 3, 100)

    from sklearn.preprocessing import StandardScaler
    p = Pipeline([
        SKLearnWrapper(StandardScaler()),
        SKLearnWrapper(LinearSVC(), HyperparameterSpace({'C': RandInt(0, 10000)})),
    ])

    auto_ml = AutoML(
        p,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=10,
        cache_folder_when_no_handle='cache',
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder='cache'),
        continue_loop_on_error=False)

    random_search = auto_ml.fit(data_inputs, expected_outputs)

    best_model = random_search.get_best_model()
    assert isinstance(best_model, Pipeline)


def _make_autoML_loop(tmpdir, p: Pipeline):
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir) + "_hp")
    n_epochs = 1
    return AutoML(
        pipeline=p,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        cache_folder_when_no_handle=str(tmpdir),
        continue_loop_on_error=False
    )


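# A minimal usage sketch for the `_make_autoML_loop` factory above. The pipeline and
# data are illustrative assumptions borrowed from patterns used elsewhere in this
# file; they are not from the original tests that call this factory.
def test_make_automl_loop_usage_sketch(tmpdir):
    p = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression()
    ])
    auto_ml = _make_autoML_loop(str(tmpdir), p)

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2

    auto_ml = auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # The single trial should have been refit, so a best model is available.
    assert auto_ml.get_best_model() is not None

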
def test_logger_automl(self, tmpdir):
    # Given
    context = ExecutionContext()
    self.tmpdir = str(tmpdir)
    hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
    n_epochs = 2
    n_trials = 4
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            LoggingStep()
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=n_trials,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
    auto_ml.handle_fit(data_container, context)

    # Then
    file_paths = [
        os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
        for i in range(n_trials)
    ]
    assert len(file_paths) == n_trials

    for file_path in file_paths:
        assert os.path.exists(file_path)

    for file_path in file_paths:
        with open(file_path, 'r') as log_file:
            log = log_file.readlines()
            assert len(log) == 36


def test_automl_savebestmodel_callback(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    validation_splitter = ValidationSplitter(0.20)
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=validation_splitter,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            BestModelCheckpoint()
        ],
        n_trials=1,
        epochs=10,
        refit_trial=False,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trials: Trials = hp_repository.load_all_trials()
    best_trial = trials.get_best_trial()
    best_trial_score = best_trial.get_validation_score()
    best_trial.cache_folder = hp_repository.cache_folder
    best_model = best_trial.get_model('best')
    _, _, valid_inputs, valid_outputs = validation_splitter.split(data_inputs, expected_outputs)
    predicted_output = best_model.predict(valid_inputs)
    score = mean_squared_error(valid_outputs, predicted_output)

    assert best_trial_score == score


def _test_within_auto_ml_loop(tmpdir, pipeline):
    X_train = np.random.random((25, 50)).astype(np.float32)
    Y_train = np.random.random((25,)).astype(np.float32)

    validation_splitter = KFoldCrossValidationSplitter(3)
    scoring_callback = ScoringCallback(median_absolute_error, higher_score_is_better=False)

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=validation_splitter,
        scoring_callback=scoring_callback,
        n_trials=2,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder=str(tmpdir)),
        refit_trial=True,
        continue_loop_on_error=False)

    auto_ml.fit(X_train, Y_train)


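# An illustrative call to the `_test_within_auto_ml_loop` helper above. The Ridge
# pipeline is an assumption for this sketch; the real tests pass in the specific
# pipelines they are exercising.
def test_within_auto_ml_loop_usage_sketch(tmpdir):
    pipeline = Pipeline([
        SKLearnWrapper(
            Ridge(),
            HyperparameterSpace({'alpha': LogUniform(0.1, 10.0)}))
    ])

    # The helper raises if any trial errors out (continue_loop_on_error=False).
    _test_within_auto_ml_loop(str(tmpdir), pipeline)

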
def test_automl_with_kfold(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=ValidationSplitter(0.20),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False),
        ],
        n_trials=1,
        epochs=10,
        refit_trial=True,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
    outputs = p.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1000


def test_automl_early_stopping_callback(tmpdir):
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 10
    max_epochs_without_improvement = 3
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False),
            EarlyStoppingCallback(max_epochs_without_improvement)
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trial = hp_repository.trials[0]
    assert len(trial.validation_splits) == 1
    validation_scores = trial.validation_splits[0].get_validation_scores()
    nepochs_executed = len(validation_scores)
    assert nepochs_executed == max_epochs_without_improvement + 1


def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    DATA_INPUTS = np.array(range(100))
    EXPECTED_OUTPUTS = np.array(range(100, 200))

    HYPERPARAMETER_SPACE = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=classic_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=classic_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')

    pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN())
    ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=resumable_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ]
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)


def main(chosen_device):
    exercice_number = 1
    print('exercice {}\n=================='.format(exercice_number))

    data_inputs, expected_outputs = generate_data(
        # See: https://github.com/guillaume-chevalier/seq2seq-signal-prediction/blob/master/datasets.py
        exercice_number=exercice_number,
        n_samples=None,
        window_size_past=None,
        window_size_future=None)

    print('data_inputs shape: {} => (n_samples, window_size_past, input_dim)'.format(data_inputs.shape))
    print('expected_outputs shape: {} => (n_samples, window_size_future, output_dim)'.format(expected_outputs.shape))

    sequence_length = data_inputs.shape[1]
    input_dim = data_inputs.shape[2]
    output_dim = expected_outputs.shape[2]

    batch_size = 100
    epochs = 3
    validation_size = 0.15
    max_plotted_validation_predictions = 10

    seq2seq_pipeline_hyperparams = HyperparameterSamples({
        'hidden_dim': 100,
        'layers_stacked_count': 2,
        'lambda_loss_amount': 0.0003,
        'learning_rate': 0.006,
        'window_size_future': sequence_length,
        'output_dim': output_dim,
        'input_dim': input_dim
    })

    pipeline = Pipeline([
        MiniBatchSequentialPipeline(
            [
                ForEachDataInput(MeanStdNormalizer()),
                ToNumpy(),
                PlotPredictionsWrapper(
                    Tensorflow2ModelStep(
                        # See: https://github.com/Neuraxio/Neuraxle-TensorFlow
                        create_model=create_model,
                        create_loss=create_loss,
                        create_optimizer=create_optimizer,
                        expected_outputs_dtype=tf.dtypes.float32,
                        data_inputs_dtype=tf.dtypes.float32,
                        device_name=chosen_device,
                        print_loss=True
                    ).set_hyperparams(seq2seq_pipeline_hyperparams))
            ],
            batch_size=batch_size),
    ]).set_name('SignalPrediction')

    trainer = Trainer(
        epochs=epochs,
        validation_splitter=ValidationSplitter(test_size=validation_size),
        scoring_callback=ScoringCallback(
            metric_function=metric_3d_to_2d_wrapper(mean_squared_error),
            higher_score_is_better=False))

    trial: Trial = trainer.train(
        pipeline=pipeline,
        data_inputs=data_inputs,
        expected_outputs=expected_outputs)

    plot_metrics(
        metric_name='mse',
        train_values=trial.validation_splits[0].metrics_results['main']['train_values'],
        validation_values=trial.validation_splits[0].metrics_results['main']['validation_values'],
        exercice_number=exercice_number)

    # Get the trained pipeline.
    pipeline = trial.get_trained_pipeline(split_number=0)

    # Get the validation set with the trainer.validation_split_function.split function.
    _, _, data_inputs_validation, expected_outputs_validation = trainer.validation_split_function.split(
        data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Enable the plotting feature inside the PlotPredictionsWrapper wrapper step.
    pipeline.apply('toggle_plotting')
    pipeline.apply(
        method='set_max_plotted_predictions',
        max_plotted_predictions=max_plotted_validation_predictions)

    # Transform the trained pipeline to plot predictions.
    pipeline.transform_data_container(DataContainer(
        data_inputs=data_inputs_validation[0],
        expected_outputs=expected_outputs_validation[0]))


def main():
    def accuracy(data_inputs, expected_outputs):
        return np.mean(
            np.argmax(np.array(data_inputs), axis=1) == np.argmax(np.array(expected_outputs), axis=1))

    # load the dataset
    df = read_csv('data/winequality-white.csv', sep=';')
    data_inputs = df.values
    data_inputs[:, -1] = data_inputs[:, -1] - 1
    n_features = data_inputs.shape[1] - 1
    n_classes = 10

    p = Pipeline([
        TrainOnlyWrapper(DataShuffler()),
        ColumnTransformerInputOutput(
            input_columns=[(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                ToNumpy(np.float32)
            )],
            output_columns=[(11, Identity())]
        ),
        OutputTransformerWrapper(PlotDistribution(column=-1)),
        MiniBatchSequentialPipeline([
            Tensorflow2ModelStep(
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer
            ).set_hyperparams(HyperparameterSamples({
                'n_dense_layers': 2,
                'input_dim': n_features,
                'optimizer': 'adam',
                'activation': 'relu',
                'kernel_initializer': 'he_uniform',
                'learning_rate': 0.01,
                'hidden_dim': 20,
                'n_classes': 3
            })).set_hyperparams_space(HyperparameterSpace({
                'n_dense_layers': RandInt(2, 4),
                'hidden_dim_layer_multiplier': Uniform(0.30, 1),
                'input_dim': FixedHyperparameter(n_features),
                'optimizer': Choice([
                    OPTIMIZERS.ADAM.value,
                    OPTIMIZERS.SGD.value,
                    OPTIMIZERS.ADAGRAD.value
                ]),
                'activation': Choice([
                    ACTIVATIONS.RELU.value,
                    ACTIVATIONS.TANH.value,
                    ACTIVATIONS.SIGMOID.value,
                    ACTIVATIONS.ELU.value,
                ]),
                'kernel_initializer': Choice([
                    KERNEL_INITIALIZERS.GLOROT_NORMAL.value,
                    KERNEL_INITIALIZERS.GLOROT_UNIFORM.value,
                    KERNEL_INITIALIZERS.HE_UNIFORM.value
                ]),
                'learning_rate': LogUniform(0.005, 0.01),
                'hidden_dim': RandInt(3, 80),
                'n_classes': FixedHyperparameter(n_classes)
            }))
        ], batch_size=33),
        OutputTransformerWrapper(Pipeline([
            ExpandDim(),
            OneHotEncoder(nb_columns=n_classes, name='classes')
        ]))
    ])

    auto_ml = AutoML(
        pipeline=p,
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder='trials'),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.30),
        scoring_callback=ScoringCallback(accuracy, higher_score_is_better=True),
        callbacks=[
            MetricCallback(
                name='classification_report_imbalanced_metric',
                metric_function=classificaiton_report_imbalanced_metric,
                higher_score_is_better=True),
            MetricCallback(
                name='f1',
                metric_function=f1_score_weighted,
                higher_score_is_better=True),
            MetricCallback(
                name='recall',
                metric_function=recall_score_weighted,
                higher_score_is_better=True),
            MetricCallback(
                name='precision',
                metric_function=precision_score_weighted,
                higher_score_is_better=True),
            EarlyStoppingCallback(max_epochs_without_improvement=3)
        ],
        n_trials=200,
        refit_trial=True,
        epochs=75)

    auto_ml = auto_ml.fit(data_inputs=data_inputs)


def main():
    # Define classification models, and hyperparams.
    # See also HyperparameterSpace documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace

    decision_tree_classifier = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    extra_tree_classifier = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    ridge_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RidgeClassifier(),
            HyperparameterSpace({
                'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
                'fit_intercept': Boolean(),
                'normalize': Boolean()
            }))
    ]).set_name('RidgeClassifier')

    logistic_regression = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            LogisticRegression(),
            HyperparameterSpace({
                'C': LogUniform(0.01, 10.0),
                'fit_intercept': Boolean(),
                'penalty': Choice(['none', 'l2']),
                'max_iter': RandInt(20, 200)
            }))
    ]).set_name('LogisticRegression')

    random_forest_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RandomForestClassifier(),
            HyperparameterSpace({
                'n_estimators': RandInt(50, 600),
                'criterion': Choice(['gini', 'entropy']),
                'min_samples_leaf': RandInt(2, 5),
                'min_samples_split': RandInt(2, 4),
                'bootstrap': Boolean()
            }))
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifiers.
    # See also ChooseOneStepOf documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf
    pipeline = Pipeline([
        ChooseOneStepOf([
            decision_tree_classifier,
            extra_tree_classifier,
            ridge_classifier,
            logistic_regression,
            random_forest_classifier
        ])
    ])

    # Create the AutoML loop object.
    # See also AutoML documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML
    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
        refit_trial=True,
        continue_loop_on_error=False)

    # Load the data, and launch the AutoML loop!
    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict.
    # See also predict documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict
    best_pipeline = auto_ml.get_best_model()
    y_pred = best_pipeline.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    shutil.rmtree('cache')


def main(tmpdir):
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but they could already be
    # set within the classes at their definition if using custom classes, or they could also be defined after
    # declaring the pipeline, using a flat dict or a nested dict.

    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
        ]),
        ModelStacking([
            SKLearnWrapper(
                GradientBoostingRegressor(),
                HyperparameterSpace({
                    "n_estimators": RandInt(50, 300),
                    "max_depth": RandInt(1, 4),
                    "learning_rate": LogUniform(0.07, 0.7)
                })
            ),
            SKLearnWrapper(
                KMeans(),
                HyperparameterSpace({"n_clusters": RandInt(5, 10)})
            ),
        ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({"alpha": LogUniform(0.7, 1.4), "fit_intercept": Boolean()})
            ),
        )
    ])

    print("Meta-fitting on train:")
    auto_ml = AutoML(
        p,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=1,  # 1 epoch here due to using sklearn models that just fit once.
        cache_folder_when_no_handle=str(tmpdir),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    )

    random_search = auto_ml.fit(X_train, y_train)
    p = random_search.get_best_model()
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")
    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)

    print("")
    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)