def test_predict_should_transform_with_initial_is_train_mode_after_predict():
    tape_fit = TapeCallbackFunction()
    tape_transform = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(
            CallbackWrapper(MultiplyByN(2), tape_transform, tape_fit)),
        TrainOnlyWrapper(
            CallbackWrapper(MultiplyByN(4), tape_transform, tape_fit))
    ])

    # predict() temporarily switches the pipeline to test mode and then restores
    # the initial train mode, so the subsequent transform() takes the
    # TrainOnlyWrapper branch (MultiplyByN(4)).
    p.predict(np.array([1, 1]))
    outputs = p.transform(np.array([1, 1]))

    assert np.array_equal(outputs, np.array([4, 4]))
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    p = Pipeline([
        NumpyShapePrinter(),
        AddFeatures([
            PCA(n_components=2),
            FastICA(n_components=2),
        ]),
        NumpyShapePrinter(),
        RidgeModelStacking([
            GradientBoostingRegressor(),
            GradientBoostingRegressor(n_estimators=500),
            GradientBoostingRegressor(max_depth=5),
            KMeans(),
        ]),
        NumpyShapePrinter(),
    ])

    print("Fitting on train:")
    p = p.fit(X_train, y_train)
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)
    print("")

    print("Evaluating transformed train:")
    # r2_score expects the ground truth first, then the predictions.
    score_train = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_train)
    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)

    assert y_train_predicted.shape == (379,)
    assert y_test_predicted.shape == (127,)
    assert isinstance(score_train, float)
    assert isinstance(score_test, float)

    return y_train_predicted, y_test_predicted, score_train, score_test
def test_predict_should_predict_in_test_mode():
    tape_fit = TapeCallbackFunction()
    tape_transform = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(
            CallbackWrapper(MultiplyByN(2), tape_transform, tape_fit)),
        TrainOnlyWrapper(
            CallbackWrapper(MultiplyByN(4), tape_transform, tape_fit))
    ])

    # predict() runs the pipeline in test mode, so only the TestOnlyWrapper
    # branch (MultiplyByN(2)) is applied.
    outputs = p.predict(np.array([1, 1]))

    assert np.array_equal(outputs, np.array([2, 2]))
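# A minimal companion sketch (hedged: it assumes Neuraxle's BaseStep.set_train(is_train)
# API, which predict() relies on internally) showing the same train/test routing
# toggled explicitly instead of through predict():
def test_set_train_should_toggle_only_wrappers_sketch():
    p = Pipeline([
        TestOnlyWrapper(MultiplyByN(2)),
        TrainOnlyWrapper(MultiplyByN(4))
    ])

    p.set_train(False)  # test mode: only the TestOnlyWrapper branch applies
    assert np.array_equal(p.transform(np.array([1, 1])), np.array([2, 2]))

    p.set_train(True)  # train mode: only the TrainOnlyWrapper branch applies
    assert np.array_equal(p.transform(np.array([1, 1])), np.array([4, 4]))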
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here, during the pipeline
    # definition. They could instead already be set within the classes at their
    # definition (when using custom classes), or be defined after declaring the
    # pipeline, using a flat dict or a nested dict (see the flat-dict sketch
    # after this function).
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])

    print("Meta-fitting on train:")
    p = p.meta_fit(X_train, y_train,
                   metastep=RandomSearch(
                       n_iter=10,
                       higher_score_is_better=True,
                       validation_technique=KFoldCrossValidationWrapper(
                           scoring_function=r2_score, k_fold=10)))
    # Here is an alternative, more "pipeliney" way to do it:
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidationWrapper(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)

    print("")
    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")
    print("Evaluating transformed train:")
    # r2_score expects the ground truth first, then the predictions.
    score_train = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_train)

    print("")
    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
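# As noted above, the hyperparameter space can also be attached after the
# pipeline is declared, using a flat dict. A minimal sketch (hedged: the exact
# auto-generated step names, e.g. "SKLearnWrapper_PCA", depend on how Neuraxle
# names wrapped steps in your version; adjust them to the names your pipeline
# actually reports):
#
# p.set_hyperparams_space(HyperparameterSpace({
#     "AddFeatures__SKLearnWrapper_PCA__n_components": RandInt(1, 3),
#     "AddFeatures__SKLearnWrapper_FastICA__n_components": RandInt(1, 3),
#     "ModelStacking__SKLearnWrapper_KMeans__n_clusters": RandInt(5, 10),
# }))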
# %%
# Spline features make it possible for the linear model to successfully
# leverage the periodic time-related features and reduce the error from ~14% to
# ~10% of the maximum demand, which is similar to what we observed with the
# one-hot encoded features.
#
# Qualitative analysis of the impact of features on linear model predictions
# ---------------------------------------------------------------------------
#
# Here, we want to visualize the impact of the feature engineering choices on
# the time-related shape of the predictions.
#
# To do so we consider an arbitrary time-based split to compare the predictions
# on a range of held-out data points.
naive_linear_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
naive_linear_predictions = naive_linear_pipeline.predict(X.iloc[test_0])

one_hot_linear_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
one_hot_linear_predictions = one_hot_linear_pipeline.predict(X.iloc[test_0])

cyclic_cossin_linear_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
cyclic_cossin_linear_predictions = cyclic_cossin_linear_pipeline.predict(
    X.iloc[test_0])

cyclic_spline_linear_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
cyclic_spline_linear_predictions = cyclic_spline_linear_pipeline.predict(
    X.iloc[test_0])

# %%
# We visualize those predictions by zooming on the last 96 hours (4 days) of
# the test set to get some qualitative insights:
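# %%
# A minimal plotting sketch for that zoom (hedged: the figure size, labels, and
# styling below are illustrative assumptions, not necessarily the example's
# exact figure code):
import matplotlib.pyplot as plt

last_hours = slice(-96, None)  # the last 96 hourly observations of the test set
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y.iloc[test_0].values[last_hours], "x-", alpha=0.2,
        label="Actual demand", color="black")
ax.plot(naive_linear_predictions[last_hours], "x-",
        label="Ordinal time features")
ax.plot(one_hot_linear_predictions[last_hours], "x-",
        label="One-hot time features")
ax.plot(cyclic_cossin_linear_predictions[last_hours], "x-",
        label="Trigonometric time features")
ax.plot(cyclic_spline_linear_predictions[last_hours], "x-",
        label="Spline-based time features")
ax.set_xlabel("Hours in the zoomed test window")
ax.set_ylabel("Relative demand")
_ = ax.legend()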
def main(tmpdir):
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here, during the pipeline
    # definition. They could instead already be set within the classes at their
    # definition (when using custom classes), or be defined after declaring the
    # pipeline, using a flat dict or a nested dict.
    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 300),
                        "max_depth": RandInt(1, 4),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })
                ),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})
                ),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({"alpha": LogUniform(0.7, 1.4), "fit_intercept": Boolean()})
            ),
        )
    ])

    print("Meta-fitting on train:")
    auto_ml = AutoML(
        p,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=1,  # 1 epoch here because the sklearn models fit in a single pass.
        cache_folder_when_no_handle=str(tmpdir),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    )

    auto_ml = auto_ml.fit(X_train, y_train)
    p = auto_ml.get_best_model()

    print("")
    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")
    print("Evaluating transformed train:")
    # r2_score expects the ground truth first, then the predictions.
    score_train = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_train)

    print("")
    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)