def test_cv_regression(self):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
    output = os.path.join(self.test_dir, '..', '.out_regression_fit')
    self._setUp(tmp)
    self._setUp(output)

    # The folders are removed in a ``finally`` clause so that a failing
    # assertion or an exception raised while fitting does not skip the
    # clean-up.
    try:
        X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')
        automl = AutoSklearnRegressor(time_left_for_this_task=30,
                                      per_run_time_limit=5,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (356,))
        score = mean_squared_error(Y_test, predictions)
        # On average np.sqrt(30) away from the target -> ~5.5 on average
        # Results with select rates drops avg score to a range of -32.40
        # to -37, on 30 seconds constraint. With more
        # time_left_for_this_task this is no longer an issue
        # NOTE(review): the negative threshold presumably relies on
        # mean_squared_error being a negated scorer -- confirm.
        self.assertGreaterEqual(score, -37)
    finally:
        self._tearDown(tmp)
        self._tearDown(output)
def test_regression_pandas_support(tmp_dir, output_dir, dask_client):
    """Fit, score and refit a regressor directly on pandas input."""
    features, target = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the data arrives as pandas objects.
    assert isinstance(features, pd.DataFrame)
    assert isinstance(target, pd.Series)

    settings = dict(
        time_left_for_this_task=40,
        per_run_time_limit=5,
        dask_client=dask_client,
        tmp_folder=tmp_dir,
        output_folder=output_dir,
    )
    regressor = AutoSklearnRegressor(**settings)

    # Fitting on the raw DataFrame / Series is expected to succeed.
    regressor.fit(features, target)

    # The model is scored on its own training data, so this only checks
    # that the pipeline works and beats the 0.5 bar.
    fitted_score = regressor.score(features, target)
    assert fitted_score >= 0.5, print_debug_information(regressor)

    # The same bar has to hold after refitting the found models.
    regressor.refit(features, target)
    refitted_score = r2(target, regressor.predict(features))
    assert refitted_score > 0.5, print_debug_information(regressor)

    n_successful = count_succeses(regressor.cv_results_)
    assert n_successful > 0, print_debug_information(regressor)
def test_cv_regression(self):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """
    tmp = os.path.join(self.test_dir, '..', '.tmp_regression_fit_cv')
    output = os.path.join(self.test_dir, '..', '.out_regression_fit_cv')
    self._setUp(tmp)
    self._setUp(output)

    # The folders are removed in a ``finally`` clause so that a failing
    # assertion or an exception raised while fitting does not skip the
    # clean-up.
    try:
        X_train, Y_train, X_test, Y_test = putil.get_dataset(
            'boston', train_size_maximum=300)
        automl = AutoSklearnRegressor(time_left_for_this_task=60,
                                      per_run_time_limit=10,
                                      resampling_strategy='cv',
                                      tmp_folder=tmp,
                                      output_folder=output)

        automl.fit(X_train, Y_train)
        predictions = automl.predict(X_test)
        self.assertEqual(predictions.shape, (206, ))
        score = r2(Y_test, predictions)
        # Dump targets and predictions to make a failing score easier to
        # inspect in the test output.
        print(Y_test)
        print(predictions)
        self.assertGreaterEqual(score, 0.1)
        # At least one run has to be counted as successful.
        self.assertGreater(self._count_succeses(automl.cv_results_), 0)
    finally:
        self._tearDown(tmp)
        self._tearDown(output)
def test_regression_pandas_support(self):
    """Fit, score and refit a regressor directly on pandas input."""
    features, target = sklearn.datasets.fetch_openml(
        data_id=41514,  # diabetes
        return_X_y=True,
        as_frame=True,
    )

    # The test is only meaningful when the data arrives as pandas objects.
    self.assertTrue(isinstance(features, pd.DataFrame))
    self.assertTrue(isinstance(target, pd.Series))

    regressor = AutoSklearnRegressor(
        time_left_for_this_task=30,
        per_run_time_limit=5,
    )

    # Fitting on the raw DataFrame / Series is expected to succeed.
    regressor.fit(features, target)

    # The model is scored on its own training data, so this only checks
    # that the pipeline works and beats the 0.5 bar.
    fitted_score = regressor.score(features, target)
    self.assertTrue(fitted_score > 0.5)

    # The same bar has to hold after refitting the found models.
    regressor.refit(features, target)
    refitted_score = r2(target, regressor.predict(features))
    self.assertTrue(refitted_score > 0.5)
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    settings = dict(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    regressor = AutoSklearnRegressor(**settings)
    regressor.fit(X_train, Y_train)

    # First log file matching AutoML*.log in the tmp folder; its content
    # is used as the failure message of the assertions below.
    matching_logs = glob.glob(os.path.join(tmp_dir, 'AutoML*.log'))
    log_file_path = matching_logs[0]

    y_pred = regressor.predict(X_test)
    assert y_pred.shape == (206, )

    cv_score = r2(Y_test, y_pred)
    assert cv_score >= 0.1, extract_msg_from_log(log_file_path)

    n_successful = count_succeses(regressor.cv_results_)
    assert n_successful > 0, extract_msg_from_log(log_file_path)
def train_autosklearn(l=None):
    """Search for, fit and refit an auto-sklearn regressor on training data.

    Args:
        l: Data holder exposing ``X_train`` and ``y_train`` attributes that
            each provide ``.values``. When ``None``, ``get_data()`` supplies
            it.

    Returns:
        Whatever ``attributedict_from_locals('model')`` produces for the
        fitted ``model``.
    """
    if l is None:
        l = get_data()

    settings = dict(
        delete_output_folder_after_terminate=True,
        delete_tmp_folder_after_terminate=True,
        disable_evaluator_output=False,
        ensemble_nbest=50,
        ensemble_size=1,  # 50 ... 1 for vanilla
        exclude_estimators=None,
        exclude_preprocessors=None,
        get_smac_object_callback=None,
        include_estimators=None,
        include_preprocessors=None,
        initial_configurations_via_metalearning=0,  # 25 ... 0 for vanilla
        logging_config=None,
        ml_memory_limit=3072,
        output_folder=None,
        per_run_time_limit=360,
        # Alternative: resampling_strategy='holdout' with
        # resampling_strategy_arguments=None.
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5},
        seed=1,
        shared_mode=False,
        smac_scenario_args=None,
        time_left_for_this_task=3600,
        tmp_folder=None,
    )
    # The local has to be called ``model``: it is looked up by that name in
    # the return statement.
    model = AutoSklearnRegressor(**settings)

    # Each call receives its own fresh copies of the training arrays.
    model.fit(l.X_train.values.copy(),
              l.y_train.values.squeeze().copy())
    model.refit(l.X_train.values.copy(),
                l.y_train.values.squeeze().copy())

    print(model.show_models())
    return attributedict_from_locals('model')
def spawn_regressor(
        seed, time, search_space, prep_space, metric, dataset_name=None):
    """Run one auto-sklearn instance; meant to be started as a subprocess.

    auto-sklearn does not spawn worker processes itself. The main block
    calls this function several times, each call in a new process that
    runs a single auto-sklearn instance.

    Reads ``tmp_folder``, ``output_folder``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test`` from the enclosing module scope.

    Returns:
        The ``cv_results_`` of the fitted regressor.
    """
    # Only the seed-0 process starts from the meta-learning suggestions, so
    # the parallel processes do not all evaluate the same configurations.
    is_first_worker = seed == 0
    initial_configurations_via_metalearning = 25 if is_first_worker else 0
    smac_scenario_args = (
        {} if is_first_worker else {'initial_incumbent': 'RANDOM'}
    )

    # What differs from a stand-alone auto-sklearn run:
    # 1. every instance writes to the same output directory
    # 2. shared_mode=True lets the instances share data between models
    # 3. every instance must be given a different seed
    settings = dict(
        # sec., how long this seed's fit process should run
        time_left_for_this_task=time,
        # sec., each model may only take this long before it's killed
        per_run_time_limit=15,
        # MB, memory limit imposed on each call to a ML algorithm
        ml_memory_limit=1024,
        # tmp folder will be shared between seeds
        shared_mode=True,
        tmp_folder=tmp_folder,
        output_folder=output_folder,
        delete_tmp_folder_after_terminate=False,
        # ensembles will be built when all optimization runs are finished
        ensemble_size=0,
        include_estimators=search_space,
        exclude_estimators=None,
        include_preprocessors=prep_space,
        exclude_preprocessors=None,
        initial_configurations_via_metalearning=(
            initial_configurations_via_metalearning
        ),
        seed=seed,
        smac_scenario_args=smac_scenario_args,
    )
    automl = AutoSklearnRegressor(**settings)
    automl.fit(
        X_train,
        y_train,
        X_test=X_test,
        y_test=y_test,
        metric=metric,
        dataset_name=dataset_name,
    )
    return automl.cv_results_
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """List inputs must reach the patched methods as numpy arrays.

    ``fit_ensemble``, ``refit`` and ``fit`` are the mock objects handed to
    the test; their recorded call arguments are inspected.
    """
    regressor = AutoSklearnRegressor()
    features = [[1], [2], [3]]
    targets = [1, 2, 3]

    # fit: both positional arguments have to be arrays.
    regressor.fit(features, targets)
    self.assertEqual(fit.call_count, 1)
    for position in (0, 1):
        self.assertIsInstance(fit.call_args[0][position], np.ndarray)

    # refit: same expectation as for fit.
    regressor.refit(features, targets)
    self.assertEqual(refit.call_count, 1)
    for position in (0, 1):
        self.assertIsInstance(refit.call_args[0][position], np.ndarray)

    # fit_ensemble only receives the targets.
    regressor.fit_ensemble(targets)
    self.assertEqual(fit_ensemble.call_count, 1)
    self.assertIsInstance(fit_ensemble.call_args[0][0], np.ndarray)
def test_conversion_of_list_to_np(self, fit_ensemble, refit, fit):
    """Plain Python lists have to be converted to numpy arrays.

    ``fit_ensemble``, ``refit`` and ``fit`` are the mock objects handed to
    the test; the positional arguments they recorded are checked.
    """
    estimator = AutoSklearnRegressor()
    data = [[1], [2], [3]]
    labels = [1, 2, 3]

    estimator.fit(data, labels)
    self.assertEqual(fit.call_count, 1)
    fit_positional = fit.call_args[0]
    self.assertIsInstance(fit_positional[0], np.ndarray)
    self.assertIsInstance(fit_positional[1], np.ndarray)

    estimator.refit(data, labels)
    self.assertEqual(refit.call_count, 1)
    refit_positional = refit.call_args[0]
    self.assertIsInstance(refit_positional[0], np.ndarray)
    self.assertIsInstance(refit_positional[1], np.ndarray)

    # The ensemble is built from the targets alone.
    estimator.fit_ensemble(labels)
    self.assertEqual(fit_ensemble.call_count, 1)
    ensemble_positional = fit_ensemble.call_args[0]
    self.assertIsInstance(ensemble_positional[0], np.ndarray)
def test_regression(self):
    """Fit a regressor on the boston data and check shape and score."""
    # A single folder serves as both tmp and output folder here.
    work_dir = os.path.join(self.test_dir, '..', '.tmp_regression_fit')
    self._setUp(work_dir)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    settings = dict(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=work_dir,
        output_folder=work_dir,
    )
    regressor = AutoSklearnRegressor(**settings)
    regressor.fit(X_train, Y_train)

    y_pred = regressor.predict(X_test)
    self.assertEqual(y_pred.shape, (356, ))

    # On average np.sqrt(30) away from the target -> ~5.5 on average
    mse_score = mean_squared_error(Y_test, y_pred)
    self.assertGreaterEqual(mse_score, -30)
def test_regression(self):
    """Fit a regressor on the boston data and check shape and score."""
    parent_dir = os.path.join(self.test_dir, '..')
    tmp_dir = os.path.join(parent_dir, '.tmp_regression_fit')
    out_dir = os.path.join(parent_dir, '.out_regression_fit')
    for folder in (tmp_dir, out_dir):
        self._setUp(folder)

    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    settings = dict(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        output_folder=out_dir,
    )
    regressor = AutoSklearnRegressor(**settings)
    regressor.fit(X_train, Y_train)

    y_pred = regressor.predict(X_test)
    self.assertEqual(y_pred.shape, (356,))

    # On average np.sqrt(30) away from the target -> ~5.5 on average
    mse_score = mean_squared_error(Y_test, y_pred)
    self.assertGreaterEqual(mse_score, -30)
def train_regression():
    """Train an auto-sklearn regressor and pickle the fitted model.

    The training data is read with ``file_loader`` from
    ``c99temp_train.snappy.csv``; the ``tempBoardSLAVE`` column of the
    second returned frame is the regression target. The fitted model is
    written to a timestamped ``.dump`` file under ``AUTO_ML_MODELS_PATH``.
    """
    # A strftime timestamp is used instead of ``str(datetime.now())``: the
    # latter contains spaces and colons, and colons are not allowed in
    # Windows file names.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
    dump_file = os.path.join(
        AUTO_ML_MODELS_PATH,
        'auto_sklearn_regressor' + timestamp + '.dump',
    )

    features, outcome_slave, _ = file_loader('c99temp_train.snappy.csv')
    features = features.values
    outcome_slave = outcome_slave['tempBoardSLAVE'].values

    model = AutoSklearnRegressor(
        time_left_for_this_task=3600,
        per_run_time_limit=600,
    )
    model.fit(features, outcome_slave)

    # The dump file is opened only after the fit finished.
    with open(dump_file, 'wb') as f:
        pickle.dump(model, f)
class AutoML(AbstractModel):
    """Regression model driven by an auto-sklearn search."""

    def __init__(self):
        super().__init__()
        self.model = AutoSklearnRegressor

    def fit(self, x, y, modeldict=None):
        """Refit the searched regressor on ``x`` / ``y``.

        Runs ``param_search`` first when ``self.m`` is still falsy.
        ``modeldict`` is accepted but not used.
        """
        # NOTE(review): ``self.m`` is presumably initialised by
        # AbstractModel -- confirm.
        search_needed = not self.m
        if search_needed:
            self.param_search(x, y)
        self.m.refit(x, y)

    def param_search(self, x, y, time_per_sample=3.5, **kwargs):
        """Run the auto-sklearn search and score it with a 10-fold CV.

        The search budget is ``len(y) * time_per_sample`` seconds, cut to
        an int. After the search the regressor is refitted on every
        training fold and scored on the matching test fold.

        Returns:
            ``self.concat_results`` applied to the mean RMSE, MAE and R2
            over the folds.
        """
        budget = int(len(y) * time_per_sample)
        self.m = AutoSklearnRegressor(
            time_left_for_this_task=budget,
            resampling_strategy="cv",
            resampling_strategy_arguments={'folds': 10},
        )
        self.m.fit(
            x,
            y,
            metric=mean_squared_error,
            dataset_name="Land Use Regression",
        )

        splitter = KFold(n_splits=10, shuffle=True)
        rmse_per_fold, mae_per_fold, r2_per_fold = [], [], []
        for fit_rows, eval_rows in splitter.split(x, y):
            x_fit, x_eval = x[fit_rows], x[eval_rows]
            y_fit, y_eval = y[fit_rows], y[eval_rows]

            self.m.refit(x_fit, y_fit)
            fold_predictions = self.m.predict(x_eval)
            fold_rmse, fold_mae, fold_r2 = self.score_function(
                y_eval, fold_predictions)

            rmse_per_fold.append(fold_rmse)
            mae_per_fold.append(fold_mae)
            r2_per_fold.append(fold_r2)

        return self.concat_results(
            np.mean(rmse_per_fold),
            np.mean(mae_per_fold),
            np.mean(r2_per_fold),
        )
def test_regression(tmp_dir, output_dir, dask_client):
    """Fit a regressor on the boston data and check shape and score."""
    X_train, Y_train, X_test, Y_test = putil.get_dataset('boston')

    settings = dict(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    regressor = AutoSklearnRegressor(**settings)
    regressor.fit(X_train, Y_train)

    y_pred = regressor.predict(X_test)
    assert y_pred.shape == (356, )

    # On average np.sqrt(30) away from the target -> ~5.5 on average
    # Results with select rates drops avg score to a range of -32.40 to -37,
    # on 30 seconds constraint. With more time_left_for_this_task this is no
    # longer an issue
    mse_score = mean_squared_error(Y_test, y_pred)
    assert mse_score >= -37, print_debug_information(regressor)

    n_successful = count_succeses(regressor.cv_results_)
    assert n_successful > 0
def test_regression_methods_returns_self(self):
    """fit, fit_ensemble and refit must each return the estimator itself."""
    X_train, y_train, X_test, y_test = putil.get_dataset('boston')
    regressor = AutoSklearnRegressor(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        ensemble_size=0,
    )

    returned_by_fit = regressor.fit(X_train, y_train)
    self.assertIs(regressor, returned_by_fit)

    returned_by_ensemble = regressor.fit_ensemble(y_train, ensemble_size=5)
    self.assertIs(regressor, returned_by_ensemble)

    # refit receives copies of the training data.
    returned_by_refit = regressor.refit(X_train.copy(), y_train.copy())
    self.assertIs(regressor, returned_by_refit)
def test_cv_regression(tmp_dir, output_dir, dask_client):
    """
    Makes sure that when using a cv strategy, we are able to fit
    a regressor
    """
    X_train, Y_train, X_test, Y_test = putil.get_dataset(
        'boston', train_size_maximum=300)

    settings = dict(
        time_left_for_this_task=60,
        per_run_time_limit=10,
        resampling_strategy='cv',
        tmp_folder=tmp_dir,
        dask_client=dask_client,
        output_folder=output_dir,
    )
    regressor = AutoSklearnRegressor(**settings)
    regressor.fit(X_train, Y_train)

    y_pred = regressor.predict(X_test)
    assert y_pred.shape == (206, )

    cv_score = r2(Y_test, y_pred)
    assert cv_score >= 0.1, print_debug_information(regressor)

    n_successful = count_succeses(regressor.cv_results_)
    assert n_successful > 0, print_debug_information(regressor)
def test_regression_methods_returns_self(self):
    """Each fitting method has to hand back the estimator it was called on."""
    X_train, y_train, X_test, y_test = putil.get_dataset('boston')
    settings = dict(
        time_left_for_this_task=20,
        per_run_time_limit=5,
        ensemble_size=0,
    )
    estimator = AutoSklearnRegressor(**settings)

    self.assertIs(estimator, estimator.fit(X_train, y_train))
    self.assertIs(estimator,
                  estimator.fit_ensemble(y_train, ensemble_size=5))
    # refit is called with copies of the training data.
    self.assertIs(estimator,
                  estimator.refit(X_train.copy(), y_train.copy()))
def test_autosklearn_regression_methods_returns_self(dask_client):
    """fit, fit_ensemble and refit must each return the estimator itself."""
    X_train, y_train, X_test, y_test = putil.get_dataset('boston')
    settings = dict(
        time_left_for_this_task=30,
        per_run_time_limit=5,
        dask_client=dask_client,
        ensemble_size=0,
    )
    regressor = AutoSklearnRegressor(**settings)

    returned_by_fit = regressor.fit(X_train, y_train)
    assert regressor is returned_by_fit

    returned_by_ensemble = regressor.fit_ensemble(y_train, ensemble_size=5)
    assert regressor is returned_by_ensemble

    # refit receives copies of the training data.
    returned_by_refit = regressor.refit(X_train.copy(), y_train.copy())
    assert regressor is returned_by_refit
def test_type_of_target(mock_estimator):
    """Check which target types the classifier and regressor accept in fit.

    Unsupported target types must raise a ``ValueError`` with the matching
    message; supported ones must not raise a ``ValueError``.
    """
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])

    # One example target per target type.
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
    y_multiclass_multioutput = np.array([[0, 1], [1, 3], [2, 2], [5, 3]])
    y_continuous_multioutput = np.array(
        [[0.1, 1.5], [1.2, 3.5], [2.7, 2.7], [5.5, 3.9]])

    # ---- classifier ----
    cls = AutoSklearnClassifier(ensemble_size=0)
    cls.automl_ = unittest.mock.Mock()
    cls.automl_.InputValidator = unittest.mock.Mock()
    cls.automl_.InputValidator.target_validator = unittest.mock.Mock()

    # Illegal for classification: multiclass-multioutput, continuous,
    # continuous-multioutput.
    rejected_by_classifier = (
        (y_multiclass_multioutput,
         r".*Classification with data of type"
         " multiclass-multioutput is not supported.*"),
        (y_continuous,
         r".*Classification with data of type"
         " continuous is not supported.*"),
        (y_continuous_multioutput,
         r".*Classification with data of type"
         " continuous-multioutput is not supported.*"),
    )
    for target, expected_msg in rejected_by_classifier:
        with pytest.raises(ValueError, match=expected_msg):
            cls.fit(X=X, y=target)

    # Legal for classification: binary, multiclass, multilabel-indicator.
    accepted_by_classifier = (
        (y_binary,
         "cls.fit() raised ValueError while fitting "
         "binary targets"),
        (y_multiclass,
         "cls.fit() raised ValueError while fitting "
         "multiclass targets"),
        (y_multilabel,
         "cls.fit() raised ValueError while fitting "
         "multilabel-indicator targets"),
    )
    for target, failure_msg in accepted_by_classifier:
        try:
            cls.fit(X, target)
        except ValueError:
            pytest.fail(failure_msg)

    # ---- regressor ----
    reg = AutoSklearnRegressor(ensemble_size=0)

    # Illegal for regression: multilabel-indicator, multiclass-multioutput.
    rejected_by_regressor = (
        (y_multilabel,
         r".*Regression with data of type"
         " multilabel-indicator is not supported.*"),
        (y_multiclass_multioutput,
         r".*Regression with data of type"
         " multiclass-multioutput is not supported.*"),
    )
    for target, expected_msg in rejected_by_regressor:
        with pytest.raises(ValueError, match=expected_msg):
            reg.fit(
                X=X,
                y=target,
            )

    # Legal for regression: continuous, multiclass,
    # continuous-multioutput, binary.
    accepted_by_regressor = (
        (y_continuous,
         "reg.fit() raised ValueError while fitting "
         "continuous targets"),
        (y_multiclass,
         "reg.fit() raised ValueError while fitting "
         "multiclass targets"),
        (y_continuous_multioutput,
         "reg.fit() raised ValueError while fitting "
         "continuous_multioutput targets"),
        (y_binary,
         "reg.fit() raised ValueError while fitting "
         "binary targets"),
    )
    for target, failure_msg in accepted_by_regressor:
        try:
            reg.fit(X, target)
        except ValueError:
            pytest.fail(failure_msg)
# ensemble_size=1, initial_configurations_via_metalearning=0,
# resampling_strategy_arguments={'folds': 5})

#-----REGRESSION-----
# Optional "vanilla" settings, currently disabled:
#   ensemble_size=1, initial_configurations_via_metalearning=0
regressor_settings = dict(
    per_run_time_limit=360,
    ml_memory_limit=1024 * 8,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
automl = AutoSklearnRegressor(**regressor_settings)

start = time.time()

#X_train = X_train.astype('float')  # when?

# Search for models, then refit them on copies of the training data.
# Change dataset_name to match the data being used.
automl.fit(X_train, y_train, dataset_name='boston_housing')
automl.refit(X_train.copy(), y_train.copy())

print('[INFO] Elapsed time finding best model: %s seconds.'
      % (time.time() - start,))

predictions = automl.predict(X_test)

# A classification report does not apply to regression:
#print('--- CLASSIFICATION REPORT: ---')
#print(classification_report(y_test, predictions, digits=5))

print('\n\n--- MODELS: ---')
print(automl.show_models())

print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())

#-----CLASSIFIER-----
#print('\n\n--- SCORE: ---')
# Load the data set and announce the start of the run.
dataframe = read_csv(address)
print(time.strftime("Start time is %Y-%m-%d %H:%M:%S", time.localtime()))

# All values are cast to int; the last column is the target and the
# remaining columns are the features.
data = dataframe.values
data = data.astype('int')
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)

# Hold out a third of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1)

# Define and run the search.
model = AutoSklearnRegressor(
    time_left_for_this_task=5 * 60,
    per_run_time_limit=30,
    n_jobs=8,
)
model.fit(X_train, y_train)
# print(model.sprint_statistics())

# Evaluate the best model on the held-out rows.
y_hat = model.predict(X_test)
mae = mean_absolute_error(y_test, y_hat)
r2Score = r2_score(y_test, y_hat)
mape = mean_absolute_percentage_error(y_test, y_hat)
mse = mean_squared_error(y_test, y_hat)

print(time.strftime("End time is %Y-%m-%d %H:%M:%S", time.localtime()))
for label, value in (
        ("MAE", mae),
        ("R2_score", r2Score),
        ("MAPE", mape),
        ("MSE", mse),
):
    print("%s: %.3f" % (label, value))
if __name__ == "__main__":
    import numpy as np
    from autosklearn.regression import AutoSklearnRegressor
    from autosklearn.metrics import mean_squared_error
    import pickle

    # Load X, y. The file is opened in a ``with`` block so the handle is
    # closed even when unpickling fails.
    # NOTE: pickle.load can execute arbitrary code -- only load
    # 'data_BA.pkl' from a trusted source.
    with open('data_BA.pkl', 'rb') as _file:
        X, y = pickle.load(_file)

    # Run the auto-sklearn search with a 4-fold cross-validation.
    regr = AutoSklearnRegressor(
        time_left_for_this_task=172800,
        per_run_time_limit=600,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 4},
        metric=mean_squared_error,
        n_jobs=2,
    )
    regr.fit(X, y)

    # Pickle the fitted regressor; ``with`` guarantees the file is flushed
    # and closed even if pickling raises.
    with open('Autoskl_bestmodel.pkl', "wb") as _file:
        pickle.dump(regr, _file)
def test_type_of_target(self, mock_estimator):
    """Check which target types the classifier and regressor accept in fit.

    Unsupported target types must raise a ``ValueError`` with the matching
    message; supported ones must not raise a ``ValueError``.
    """
    X = np.array([[1, 2], [2, 3], [3, 4], [4, 5]])

    # One example target per target type.
    y_binary = np.array([0, 0, 1, 1])
    y_continuous = np.array([0.1, 1.3, 2.1, 4.0])
    y_multiclass = np.array([0, 1, 2, 0])
    y_multilabel = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
    y_multiclass_multioutput = np.array([[0, 1], [1, 3], [2, 2], [5, 3]])
    y_continuous_multioutput = np.array(
        [[0.1, 1.5], [1.2, 3.5], [2.7, 2.7], [5.5, 3.9]])

    # ---- classifier ----
    cls = AutoSklearnClassifier()

    # Illegal for classification: multiclass-multioutput, continuous,
    # continuous-multioutput.
    rejected_by_classifier = (
        (y_multiclass_multioutput,
         "classification with data of type"
         " multiclass-multioutput is not supported"),
        (y_continuous,
         "classification with data of type"
         " continuous is not supported"),
        (y_continuous_multioutput,
         "classification with data of type"
         " continuous-multioutput is not supported"),
    )
    for target, expected_msg in rejected_by_classifier:
        self.assertRaisesRegex(
            ValueError,
            expected_msg,
            cls.fit,
            X=X,
            y=target,
        )

    # Legal for classification: binary, multiclass, multilabel-indicator.
    accepted_by_classifier = (
        (y_binary,
         "cls.fit() raised ValueError while fitting "
         "binary targets"),
        (y_multiclass,
         "cls.fit() raised ValueError while fitting "
         "multiclass targets"),
        (y_multilabel,
         "cls.fit() raised ValueError while fitting "
         "multilabel-indicator targets"),
    )
    for target, failure_msg in accepted_by_classifier:
        try:
            cls.fit(X, target)
        except ValueError:
            self.fail(failure_msg)

    # ---- regressor ----
    reg = AutoSklearnRegressor()

    # Illegal for regression: multiclass-multioutput,
    # multilabel-indicator, continuous-multioutput.
    rejected_by_regressor = (
        (y_multiclass_multioutput,
         "regression with data of type"
         " multiclass-multioutput is not supported"),
        (y_multilabel,
         "regression with data of type"
         " multilabel-indicator is not supported"),
        (y_continuous_multioutput,
         "regression with data of type"
         " continuous-multioutput is not supported"),
    )
    for target, expected_msg in rejected_by_regressor:
        self.assertRaisesRegex(
            ValueError,
            expected_msg,
            reg.fit,
            X=X,
            y=target,
        )

    # Legal for regression: continuous, binary, multiclass.
    accepted_by_regressor = (
        (y_continuous,
         "reg.fit() raised ValueError while fitting "
         "continuous targets"),
        (y_binary,
         "reg.fit() raised ValueError while fitting "
         "binary targets"),
        (y_multiclass,
         "reg.fit() raised ValueError while fitting "
         "multiclass targets"),
    )
    for target, failure_msg in accepted_by_regressor:
        try:
            reg.fit(X, target)
        except ValueError:
            self.fail(failure_msg)
output_folder=outpath + 'output_folder', ) elif ml_type == 'classification': model = AutoSklearnClassifier( time_left_for_this_task=time_left_for_this_task, per_run_time_limit=per_run_time_limit, n_jobs=1, memory_limit=1000000, tmp_folder=outpath + 'log_folder', output_folder=outpath + 'output_folder', ) print("start searching") # perform the search model.fit(X_train, y_train, dataset_name=ml_type + '_t' + str(time_left_for_this_task) + '_lead' + str(l)) # summarize file = open( 'log_files/' + ml_type + '_t' + str(time_left_for_this_task) + '_lead' + str(l) + '.txt', 'w') file.write(model.sprint_statistics()) file.write('\n') file.write(model.show_models()) file.close() print(model.sprint_statistics()) print(model.show_models()) # evaluate best model
def offset_col_x_days(df, col, days):
    """Add lagged copies of ``col`` and drop the rows left incomplete.

    For every lag from 1 up to ``days - 1`` a column named
    ``{col}_prev_{lag}`` is written into ``df`` holding ``col`` shifted by
    that many rows. Rows containing missing values are then dropped and the
    index is renumbered.

    Returns:
        The cleaned frame (the result of ``dropna().reset_index``).
    """
    lag = 1
    while lag < days:
        df['{}_prev_{}'.format(col, lag)] = df[col].shift(lag)
        lag += 1
    return df.dropna().reset_index(drop=True)


# Add the lagged 'energy' columns to every station frame.
for i in range(len(stations)):
    stations[i] = offset_col_x_days(stations[i], 'energy', 7)

stations[0]


# In[67]:


# The first station is used: 'energy' is the target and the
# non-feature columns are dropped from the inputs.
X = stations[0]
y = X['energy']
X = X.drop(['name', 'date', 'energy'], axis=1)

# Split by position: the first 80% of the rows train, the rest test.
train_test_split = int(len(X) * 0.8)
X_train = X[:train_test_split]
X_test = X[train_test_split:]
y_train = y[:train_test_split]
y_test = y[train_test_split:]

from autosklearn.regression import AutoSklearnRegressor

automl = AutoSklearnRegressor()
automl.fit(X_train, y_train)
print(automl.show_models())

predictions = automl.predict(X_test)
print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
n_features=10, n_informative=5, n_targets=3) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) ############################################################################ # Build and fit a regressor # ========================= automl = AutoSklearnRegressor( time_left_for_this_task=120, per_run_time_limit=30, tmp_folder='/tmp/autosklearn_multioutput_regression_example_tmp', ) automl.fit(X_train, y_train, dataset_name='synthetic') ############################################################################ # View the models found by auto-sklearn # ===================================== print(automl.leaderboard()) ############################################################################ # Print the final ensemble constructed by auto-sklearn # ==================================================== print(automl.show_models()) ########################################################################### # Get the Score of the final ensemble
class AutoSklearnSolver:
    """Model implemented through auto-sklearn.

    https://github.com/automl/auto-sklearn

    The class runs a model through the functionality of auto-sklearn.

    Args:
        model_dir: Path to the model directory
        time_limit: Time limit for training the model (s)
        memory_limit: Limit on the amount of memory used (MB)

    Attributes:
        model_dir (str): Path to the model directory
        config (Config): Model parameters
        model ([AutoSklearnClassifier, AutoSklearnRegressor]): auto-sklearn
            model object
        per_run_time_limit (int): Time limit for training a single model
        metrics_object (autosklearn.metrics): Quality metric object
        procesed_data_path (str): Path where the processed data is saved
    """

    def __init__(self, model_dir: str, time_limit: int = 0,
                 memory_limit: int = 0) -> None:
        os.makedirs(model_dir, exist_ok=True)
        self.model_dir = model_dir
        self.config = Config(model_dir, time_limit, memory_limit)
        self.model = None
        # Initialised here so that predict() with validation on an object
        # that was neither fitted nor loaded logs a message instead of
        # failing with an AttributeError.
        self.metrics_object = None
        self.procesed_data_path = None
        self.per_run_time_limit = min(360, time_limit // 2)

    @time_logging
    def fit(self, train_csv: str, mode: str, metrics_name: str,
            save_processed_data: bool) -> None:
        """Start model fitting.

        Args:
            train_csv: Path to the training dataset
            mode: Working mode (classification or regression)
            metrics_name: Name of the quality metric object in the
                autosklearn.metrics module
            save_processed_data: Whether to save the dataset with the
                processed data
        """
        if not os.path.exists(train_csv):
            log('Data file {} is not exist!'.format(train_csv))
            return

        # resolve the metric object
        try:
            self.metrics_object = getattr(metrics, metrics_name)
        except AttributeError as error:
            self.metrics_object = None
            log('Can\'t get the metrics object!')
            log('{}: {}'.format(type(error).__name__, error))
            return

        # prepare the directory for saving the processed data
        if save_processed_data:
            self.procesed_data_path = os.path.join(self.model_dir,
                                                   'processed_data')
            os.makedirs(self.procesed_data_path, exist_ok=True)

        self.config['task'] = 'fit'
        self.config['mode'] = mode
        self.config['tmp_dir'] = self.config['model_dir'] + '/tmp'

        # remove the temporary directory
        # (auto-sklearn complains before starting if this is not done)
        shutil.rmtree(self.config['tmp_dir'], ignore_errors=True)

        # initial analysis, reading the data, splitting into X and y
        df = read_df(train_csv, self.config)
        y = df['target']
        X = df.drop('target', axis=1)

        # data processing
        process_dataframe(X, self.config)

        if save_processed_data:
            log('Saving processed data')
            X.to_csv(os.path.join(self.procesed_data_path, 'X.csv'))
            y.to_csv(os.path.join(self.procesed_data_path, 'y.csv'))

        # parameters of the auto-sklearn model to create
        # (preprocessing is switched off because it has already been done)
        model_params = {
            'time_left_for_this_task': self.config.time_left(),
            'per_run_time_limit': self.per_run_time_limit,
            'ml_memory_limit': self.config['memory_limit'],
            'tmp_folder': self.config['tmp_dir'],
            'include_preprocessors': ['no_preprocessing'],
            'delete_tmp_folder_after_terminate': True
        }

        # initialise the model object
        self.model_init(model_params)

        # train the model
        self.model_fit(X, y, self.metrics_object)

        log('model_fitted: {}'.format(type(self.model)))
        log('autosklearn model contains:')
        log(self.model.show_models())

    @time_logging
    def model_init(self, model_params: Dict[str, Any]) -> None:
        """Model initialization.

        Creates the model object that matches the task type stored in
        ``config['mode']``.

        Args:
            model_params: Dictionary of model parameters

        Raises:
            ValueError: If the mode is neither 'classification' nor
                'regression'.
        """
        mode = self.config['mode']
        if mode == 'classification':
            self.model = AutoSklearnClassifier(**model_params)
        elif mode == 'regression':
            self.model = AutoSklearnRegressor(**model_params)
        else:
            # Previously the model silently stayed None and the failure
            # only surfaced later as an AttributeError on ``None.fit``.
            raise ValueError(
                'Unknown mode {!r}: expected \'classification\' or '
                '\'regression\''.format(mode))

    @time_logging
    def model_fit(self, X: pd.DataFrame, y: pd.Series,
                  metrics: Callable) -> None:
        """Model fitting wrapper.

        Wraps the call to fit (so that its duration shows up in the log).

        Args:
            X: Feature matrix
            y: Target vector
            metrics: Quality metric object
        """
        # Suppress warnings in the log for the duration of the fit only.
        # catch_warnings restores the previous filters on exit, whereas
        # warnings.resetwarnings() would discard every filter, including
        # those configured elsewhere.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=FutureWarning)
            warnings.filterwarnings('ignore', category=RuntimeWarning)
            self.model.fit(X, y, metric=metrics)

    @time_logging
    def predict(self, test_csv: str, prediction_csv: str,
                validation_csv: str, need_proba: bool) -> pd.DataFrame:
        """Start model prediction.

        Predicts the target for new data, writes the answers to
        ``prediction_csv`` and optionally validates them.

        Args:
            test_csv: Path to the test dataset
            prediction_csv: Path for writing the model answers
            validation_csv: Path to the dataset with the correct answers
                for the test set (used to compute the metric); the string
                'None' (or ``None``) disables validation
            need_proba: Whether probabilistic predictions are required

        Returns:
            DataFrame with the model answers, or ``None`` when ``test_csv``
            does not exist.
        """
        if not os.path.exists(test_csv):
            log('Data file {} is not exist!'.format(test_csv))
            return

        self.config['task'] = 'predict'

        df = read_df(test_csv, self.config)
        process_dataframe(df, self.config)

        predictions_df = self.model_predict(df, prediction_csv, need_proba)

        # A real None is treated like the 'None' string sentinel so that it
        # is not passed on to os.path.exists().
        if validation_csv is not None and validation_csv != 'None':
            self.model_validate(predictions_df, validation_csv)

        # The documented return value was never actually returned before.
        # NOTE(review): callers only see it if the time_logging decorator
        # forwards return values -- confirm.
        return predictions_df

    @time_logging
    def model_predict(self, X: pd.DataFrame, prediction_csv: str,
                      need_proba: bool) -> pd.DataFrame:
        """Model predict wrapper.

        Args:
            X: Feature matrix
            prediction_csv: Path for writing the model answers
            need_proba: Whether probabilistic predictions are required

        Returns:
            DataFrame with the predictions, indexed like ``X``.
        """
        if (self.config['mode'] == 'classification') and need_proba:
            predictions = self.model.predict_proba(X, n_jobs=-1)
            # One column per class: 'target_0', 'target_1', ... so that
            # problems with more than two classes work as well.
            # NOTE(review): assumes predict_proba returns a 2-D array of
            # shape (samples, classes) -- confirm.
            df_columns = ['target_{}'.format(class_idx)
                          for class_idx in range(predictions.shape[1])]
        else:
            predictions = self.model.predict(X, n_jobs=-1)
            df_columns = ['target']

        # prepare the directory for writing the answers; a bare file name
        # has no directory part and os.makedirs('') would raise
        output_dir = os.path.dirname(prediction_csv)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # write the dataframe with the answers
        predictions_df = pd.DataFrame(predictions, index=X.index,
                                      columns=df_columns)
        predictions_df.to_csv(prediction_csv)

        return predictions_df

    @time_logging
    def model_validate(self, predictions_df: pd.DataFrame,
                       validation_csv: str) -> Any:
        """Model validate.

        Validates the model against known correct answers.

        Args:
            predictions_df: DataFrame with the model answers
            validation_csv: Path to the dataset with the correct answers
                for the test set

        Returns:
            The score computed by the metric object, or ``None`` when the
            metric object or the validation file is missing.
        """
        if self.metrics_object is None:
            log('Can\'t get the metrics object!')
            return

        if not os.path.exists(validation_csv):
            log('Validation file {} is not exist!'.format(validation_csv))
            return

        # read the dataset with the correct answers
        validation_df = pd.read_csv(validation_csv, encoding='utf-8',
                                    sep=',')

        # join the correct and the predicted answers so that rows match
        # NOTE(review): presumably the index of predictions_df is named
        # "line_id" -- confirm against read_df.
        compare_df = pd.merge(validation_df, predictions_df, on="line_id")

        # compute the score
        # the joined dataset has the following column positions:
        # 0: index, 1: true values, 2-...: predicted values
        score = self.metrics_object(compare_df.iloc[:, 1].values,
                                    compare_df.iloc[:, 2:].values)

        log('Metrics: {}'.format(self.metrics_object))
        log('Score: {}'.format(score))

        return score

    @time_logging
    def save(self) -> None:
        """Save model, parameters and metrics object to disk."""
        self.config.save()
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'wb') as f:
            pickle.dump(self.model, f, protocol=pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(self.config['model_dir'],
                               'metrics_object.pkl'), 'wb') as f:
            pickle.dump(self.metrics_object, f,
                        protocol=pickle.HIGHEST_PROTOCOL)

    @time_logging
    def load(self) -> None:
        """Load model, parameters and metrics object from disk."""
        self.config.load()
        # NOTE: pickle.load can execute arbitrary code -- the model
        # directory must come from a trusted source.
        with open(os.path.join(self.config['model_dir'], 'model.pkl'),
                  'rb') as f:
            self.model = pickle.load(f)
        with open(os.path.join(self.config['model_dir'],
                               'metrics_object.pkl'), 'rb') as f:
            self.metrics_object = pickle.load(f)

    def __repr__(self) -> str:
        repr_string = 'AutoSklearnSolver\n'
        repr_string += '-----------------\n'
        repr_string += str(self.config)
        return repr_string
print('[INFO] Train shape: {}'.format(X_train.shape))
print('[INFO] Test shape: {}'.format(X_test.shape))

print('[INFO] Finding best model...')

# For "auto vanilla" add: ensemble_size=1,
# initial_configurations_via_metalearning=0

#-----CLASSIFIER-----
#automl = AutoSklearnClassifier(per_run_time_limit=300, ml_memory_limit=1024 * 6, time_left_for_this_task=3600, resampling_strategy='cv',
#                               resampling_strategy_arguments={'folds': 5})

#-----REGRESSION-----
regressor_settings = dict(
    per_run_time_limit=300,
    ml_memory_limit=1024 * 6,
    time_left_for_this_task=3600,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 5},
)
automl = AutoSklearnRegressor(**regressor_settings)

start = time.time()

#X_train = X_train.astype('float')

# Search for models, then refit them on copies of the training data.
# Change dataset_name to match the data being used.
automl.fit(X_train, y_train, dataset_name='linnerud')
automl.refit(X_train.copy(), y_train.copy())

print('[INFO] Elapsed time finding best model: {} seconds.'.format(
    time.time() - start))

predictions = automl.predict(X_test)

# A classification report does not apply to regression:
#print('--- CLASSIFICATION REPORT: ---')
#print(classification_report(y_test, predictions, digits=5))

print('\n\n--- MODELS: ---')
print(automl.show_models())

print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())

#-----CLASSIFIER-----
#print('\n\n--- SCORE: ---')
#print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions))

#-----REGRESSION-----
] preprocessing_to_use = ["no_preprocessing"] # Init auto-sklearn auto_sklearn = AutoSklearnRegressor(time_left_for_this_task=60 * 5, per_run_time_limit=360, include_estimators=estimators_to_use, exclude_estimators=None, include_preprocessors=preprocessing_to_use, exclude_preprocessors=None, ml_memory_limit=6156, resampling_strategy="cv", resampling_strategy_arguments={"folds": 5}) # Train models auto_sklearn.fit(X=X_train.copy(), y=y_train.copy(), metric=mean_squared_error) it_fits = auto_sklearn.refit(X=X_train.copy(), y=y_train.copy()) # Predict y_hat = auto_sklearn.predict(X_test) # Show results auto_sklearn.cv_results_ auto_sklearn.sprint_statistics() auto_sklearn.show_models() auto_sklearn.get_models_with_weights() # TPOT from tpot import TPOTRegressor