def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score and refit an ``AutoSklearnRegressor`` on pandas inputs.

    The data is OpenML dataset 41514 (diabetes), fetched as a
    ``pd.DataFrame`` / ``pd.Series`` pair and handed to the estimator
    without any conversion.

    ``tmp_dir``, ``output_dir`` and ``dask_client`` are forwarded unchanged
    to the estimator constructor (presumably pytest fixtures -- confirm
    against the test configuration).
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        as_frame=True,
        return_X_y=True,
    )

    # The test is only meaningful when the inputs really are pandas objects.
    assert isinstance(X, pd.DataFrame)
    assert isinstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        tmp_folder=tmp_dir,
        output_folder=output_dir,
        dask_client=dask_client,
        time_left_for_this_task=40,
        per_run_time_limit=5,
    )

    # The pandas objects are passed straight in; the assertions below
    # require this call to complete.
    automl.fit(X, y)

    # Sanity threshold only: the model is scored on the data it was trained
    # on, so this exercises the code path rather than generalisation.
    train_score = automl.score(X, y)
    assert train_score >= 0.5, print_debug_information(automl)

    automl.refit(X, y)

    # The same sanity threshold must hold after refitting.
    refit_score = r2(y, automl.predict(X))
    assert refit_score > 0.5, print_debug_information(automl)

    # At least one run of the search has to have succeeded.
    n_successes = count_succeses(automl.cv_results_)
    assert n_successes > 0, print_debug_information(automl)
def train_autosklearn(l=None):
    """Fit and refit an ``AutoSklearnRegressor`` on the training split of ``l``.

    Args:
        l: Data container exposing ``X_train`` and ``y_train`` attributes
            that each provide ``.values`` (presumably pandas objects --
            confirm against ``get_data``). When ``None``, the container is
            obtained from ``get_data()``.

    Returns:
        The result of ``attributedict_from_locals('model')``.
    """
    if l is None:
        l = get_data()

    # "Vanilla" setup; the non-vanilla values noted originally were 50 and 25.
    ensemble_size = 1
    initial_configurations_via_metalearning = 0

    settings = dict(
        time_left_for_this_task=3600,
        per_run_time_limit=360,
        ml_memory_limit=3072,
        seed=1,
        shared_mode=False,
        # Alternative noted originally: resampling_strategy='holdout'
        # with resampling_strategy_arguments=None.
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        ensemble_size=ensemble_size,
        ensemble_nbest=50,
        initial_configurations_via_metalearning=(
            initial_configurations_via_metalearning),
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None,
        tmp_folder=None,
        output_folder=None,
        delete_tmp_folder_after_terminate=True,
        delete_output_folder_after_terminate=True,
        disable_evaluator_output=False,
        logging_config=None,
        get_smac_object_callback=None,
        smac_scenario_args=None,
    )

    # NOTE: the local name ``model`` is looked up by name in the return
    # statement below and must not be renamed.
    model = AutoSklearnRegressor(**settings)

    # Each call receives its own fresh copy of the training arrays.
    model.fit(
        l.X_train.values.copy(),
        l.y_train.values.squeeze().copy(),
    )
    model.refit(
        l.X_train.values.copy(),
        l.y_train.values.squeeze().copy(),
    )

    print(model.show_models())
    return attributedict_from_locals('model')
def test_regression_pandas_support(self):
    """Fit, score and refit an ``AutoSklearnRegressor`` on pandas inputs.

    The data is OpenML dataset 41514 (diabetes), fetched as a
    ``pd.DataFrame`` / ``pd.Series`` pair and handed to the estimator
    without any conversion.
    """
    X, y = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the inputs really are pandas objects.
    # assertIsInstance reports the actual type on failure, unlike
    # assertTrue(isinstance(...)).
    self.assertIsInstance(X, pd.DataFrame)
    self.assertIsInstance(y, pd.Series)

    automl = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
    )

    # The pandas objects are passed straight in; the assertions below
    # require this call to complete.
    automl.fit(X, y)

    # Sanity threshold only: the model is scored on the data it was trained
    # on. assertGreater shows the offending score when the check fails.
    self.assertGreater(automl.score(X, y), 0.5)

    automl.refit(X, y)

    # The same sanity threshold must hold after refitting.
    self.assertGreater(r2(y, automl.predict(X)), 0.5)
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """Check that plain-list inputs arrive as ``np.ndarray`` at the callees.

    ``fit``, ``refit`` and ``fit_ensemble`` are call-recording stand-ins
    exposing ``call_count`` and ``call_args`` (presumably injected by
    ``mock.patch`` decorators outside this view -- confirm).
    """
    features = [[1], [2], [3]]
    targets = [1, 2, 3]
    automl = AutoSklearnRegressor()

    # fit: both positional arguments must have been converted.
    automl.fit(features, targets)
    self.assertEqual(fit.call_count, 1)
    fit_positional = fit.call_args[0]
    self.assertIsInstance(fit_positional[0], np.ndarray)
    self.assertIsInstance(fit_positional[1], np.ndarray)

    # refit: same expectation as for fit.
    automl.refit(features, targets)
    self.assertEqual(refit.call_count, 1)
    refit_positional = refit.call_args[0]
    self.assertIsInstance(refit_positional[0], np.ndarray)
    self.assertIsInstance(refit_positional[1], np.ndarray)

    # fit_ensemble: only the targets are passed, and they must be converted.
    automl.fit_ensemble(targets)
    self.assertEqual(fit_ensemble.call_count, 1)
    ensemble_positional = fit_ensemble.call_args[0]
    self.assertIsInstance(ensemble_positional[0], np.ndarray)
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """Verify list arguments are forwarded as ``np.ndarray`` instances.

    The three extra parameters are call-recording stand-ins for the
    corresponding methods (presumably supplied by ``mock.patch``
    decorators that are not visible here -- confirm).
    """
    automl = AutoSklearnRegressor()
    X = [[1], [2], [3]]
    y = [1, 2, 3]

    # fit receives X and y; each must reach the stand-in as an array.
    automl.fit(X, y)
    self.assertEqual(fit.call_count, 1)
    for position in (0, 1):
        self.assertIsInstance(fit.call_args[0][position], np.ndarray)

    # refit receives X and y as well.
    automl.refit(X, y)
    self.assertEqual(refit.call_count, 1)
    for position in (0, 1):
        self.assertIsInstance(refit.call_args[0][position], np.ndarray)

    # fit_ensemble receives only y.
    automl.fit_ensemble(y)
    self.assertEqual(fit_ensemble.call_count, 1)
    self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
class AutoML(AbstractModel):
    """Auto-sklearn regressor exposed through the ``AbstractModel`` interface.

    The fitted estimator is kept in ``self.m`` (presumably initialised to a
    falsy value by ``AbstractModel.__init__`` -- confirm), while
    ``self.model`` holds the estimator class itself.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoSklearnRegressor

    def fit(self, x, y, modeldict=None):
        """Refit the estimator on ``x`` / ``y``, searching first if needed.

        When ``self.m`` is falsy, ``param_search`` is run to create it.
        ``modeldict`` is accepted but not used by this method.
        """
        if not self.m:
            self.param_search(x, y)
        self.m.refit(x, y)

    def param_search(self, x, y, time_per_sample=3.5, **kwargs):
        """Run the auto-sklearn search, then score it with 10-fold CV.

        Args:
            x: Feature data; indexed with integer index arrays, so it is
                assumed to be array-like in the NumPy sense -- confirm.
            y: Targets, indexed the same way as ``x``.
            time_per_sample: Multiplied by ``len(y)`` and truncated to an
                int to obtain ``time_left_for_this_task``.
            **kwargs: Accepted but not used by this method.

        Returns:
            ``self.concat_results`` applied to the mean RMSE, MAE and R2
            over the ten folds.
        """
        time_budget = int(len(y) * time_per_sample)
        self.m = AutoSklearnRegressor(
            time_left_for_this_task=time_budget,
            resampling_strategy="cv",
            resampling_strategy_arguments={'folds': 10},
        )
        self.m.fit(
            x,
            y,
            metric=mean_squared_error,
            dataset_name="Land Use Regression",
        )

        # Evaluate the found ensemble: refit on each training fold and score
        # on the held-out fold. The shuffle is unseeded, so the folds differ
        # between runs.
        splitter = KFold(n_splits=10, shuffle=True)
        rmse_per_fold, mae_per_fold, r2_per_fold = [], [], []
        for train_idx, test_idx in splitter.split(x, y):
            x_fit, x_holdout = x[train_idx], x[test_idx]
            y_fit, y_holdout = y[train_idx], y[test_idx]

            self.m.refit(x_fit, y_fit)
            holdout_predictions = self.m.predict(x_holdout)

            fold_rmse, fold_mae, fold_r2 = self.score_function(
                y_holdout, holdout_predictions)
            rmse_per_fold.append(fold_rmse)
            mae_per_fold.append(fold_mae)
            r2_per_fold.append(fold_r2)

        return self.concat_results(
            np.mean(rmse_per_fold),
            np.mean(mae_per_fold),
            np.mean(r2_per_fold),
        )
def test_regression_methods_returns_self(self):
    """``fit``, ``fit_ensemble`` and ``refit`` must each return the estimator."""
    # Only the training split is needed; the test split is discarded.
    X_train, y_train, _, _ = putil.get_dataset('boston')

    automl = AutoSklearnRegressor(
        ensemble_size=0,
        per_run_time_limit=5,
        time_left_for_this_task=20,
    )

    self.assertIs(automl, automl.fit(X_train, y_train))

    # The estimator was constructed with ensemble_size=0; the ensemble is
    # built here with an explicit size instead.
    self.assertIs(automl, automl.fit_ensemble(y_train, ensemble_size=5))

    self.assertIs(automl, automl.refit(X_train.copy(), y_train.copy()))
def test_regression_methods_returns_self(self):
    """Each of ``fit``, ``fit_ensemble`` and ``refit`` returns ``self``."""
    dataset = putil.get_dataset('boston')
    # The dataset is a 4-tuple; only the training half is used here.
    X_train, y_train, _X_test, _y_test = dataset

    automl = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        ensemble_size=0,
    )

    fit_result = automl.fit(X_train, y_train)
    self.assertIs(automl, fit_result)

    ensemble_result = automl.fit_ensemble(y_train, ensemble_size=5)
    self.assertIs(automl, ensemble_result)

    # refit is given copies of the training data.
    refit_result = automl.refit(X_train.copy(), y_train.copy())
    self.assertIs(automl, refit_result)
def test_autosklearn_regression_methods_returns_self(dask_client):
    """``fit``, ``fit_ensemble`` and ``refit`` must each return the estimator.

    ``dask_client`` is forwarded unchanged to the estimator constructor
    (presumably a pytest fixture -- confirm against the test configuration).
    """
    # Only the training split is needed; the test split is discarded.
    X_train, y_train, _, _ = putil.get_dataset('boston')

    automl = AutoSklearnRegressor(
        dask_client=dask_client,
        ensemble_size=0,
        per_run_time_limit=5,
        time_left_for_this_task=30,
    )

    assert automl is automl.fit(X_train, y_train)

    # The estimator was constructed with ensemble_size=0; the ensemble is
    # built here with an explicit size instead.
    assert automl is automl.fit_ensemble(y_train, ensemble_size=5)

    # refit is given copies of the training data.
    assert automl is automl.refit(X_train.copy(), y_train.copy())
# ----- REGRESSION -----
# Configure the search with 5-fold cross-validation as resampling strategy.
automl = AutoSklearnRegressor(
    time_left_for_this_task=3600,
    per_run_time_limit=360,
    ml_memory_limit=1024 * 8,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
    # Optional overrides left disabled:
    # ensemble_size=1,
    # initial_configurations_via_metalearning=0,
)

# Time the search plus the final refit together.
start = time.time()
# X_train = X_train.astype('float')  # when?
automl.fit(
    X_train,
    y_train,
    dataset_name='boston_housing',  # change dataset name accordingly
)
# Refit on copies of the full training data after the search.
automl.refit(X_train.copy(), y_train.copy())
elapsed = time.time() - start
print('[INFO] Elapsed time finding best model: {} seconds.'.format(elapsed))

predictions = automl.predict(X_test)

# Classification-only reporting (not applicable to regression):
# print('--- CLASSIFICATION REPORT: ---')
# print(classification_report(y_test, predictions, digits=5))

print('\n\n--- MODELS: ---')
print(automl.show_models())

print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())

# ----- CLASSIFIER -----
# print('\n\n--- SCORE: ---')
# print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions))
preprocessing_to_use = ["no_preprocessing"] # Init auto-sklearn auto_sklearn = AutoSklearnRegressor(time_left_for_this_task=60 * 5, per_run_time_limit=360, include_estimators=estimators_to_use, exclude_estimators=None, include_preprocessors=preprocessing_to_use, exclude_preprocessors=None, ml_memory_limit=6156, resampling_strategy="cv", resampling_strategy_arguments={"folds": 5}) # Train models auto_sklearn.fit(X=X_train.copy(), y=y_train.copy(), metric=mean_squared_error) it_fits = auto_sklearn.refit(X=X_train.copy(), y=y_train.copy()) # Predict y_hat = auto_sklearn.predict(X_test) # Show results auto_sklearn.cv_results_ auto_sklearn.sprint_statistics() auto_sklearn.show_models() auto_sklearn.get_models_with_weights() # TPOT from tpot import TPOTRegressor tpot_config = {